# NOTE: RandomizedLogisticRegression lives in sklearn.linear_model in the
# legacy scikit-learn releases (< 0.21) these examples target; `ut` (for
# ut.unzip) and `dict_list_representation` are project-local helpers.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, RandomizedLogisticRegression


class LogisticModelBuilder(object):

	def __init__(self):
		self.inter_levels = None
		self.dicts_rep = None
		self.dict_vectorizer = DictVectorizer()
		self.ff_model = None
		self.model = None
		
	def set_data(self, user_atts, inter_atts, responses):
		self.build_data_representations(user_atts, inter_atts)
		# Convert from dict representation into matrix:
		predictor_rows = self.dict_vectorizer.fit_transform(self.dicts_rep).toarray()
		print(predictor_rows)
		print('Finding optimal feature set...')
		self.ff_model = RandomizedLogisticRegression() # Finds best set of features
		# Fit data and get transformed input rows:
		X_new = self.ff_model.fit_transform(predictor_rows, responses)
		print(X_new)
		print('Done! Final Shape: ' + str(X_new.shape))
		print('Building Final model...')		  
		self.model = LogisticRegression().fit(X_new, responses)
		print('Done!')
	
	# Set data based on tuples/rows
	def set_data_rows(self, tuples):
		self.set_data(*ut.unzip(tuples))
	
	# Builds a list-of-dictionaries representation and the
	# msg/interaction factor level matrix.
	def build_data_representations(self, user_atts, inter_atts):
		print('Building internal data representations...')
		print('   Building factor level matrix...')
		itp = [set(col) for col in zip(*inter_atts)]  # transpose and collect unique values per column
		# Categorical columns (those containing any string) keep their full level
		# set; numeric columns are reduced to a (min, max) bounding pair.
		self.inter_levels = [lv if any(isinstance(v, str) for v in lv) else (min(lv), max(lv)) for lv in itp]
		print('   Building dict list representation...')
		self.dicts_rep = dict_list_representation(user_atts, inter_atts)
		print('Done!')	
	
	# Returns a function of form f: X x Y -> P
	# where X = <user_att vals>, Y = <inter. att vals>, and P = P(R = 1)
	def prob_f(self):
		dv = self.dict_vectorizer
		dlr = lambda x, y: dict_list_representation([x], [y])
		ff = self.ff_model
		mod = self.model
		f = lambda X, Y: mod.predict_proba(ff.transform(dv.transform(dlr(X, Y)).toarray()))
		return lambda X, Y: f(X, Y)[0][1]  # P(R = 1) for the single input row
	
	# Return a vector of interaction attribute levels corresponding to each
	# interaction attribute. For each attribute the following rule is applied:
	# 1) If the attribute is categorical the attribute levels are a list of unique values
	# 2) If the attribute is numeric then a pair (min, max) is returned bounding the values.
	def inter_attr_levels(self):
		return [lv if isinstance(lv, tuple) else list(lv) for lv in self.inter_levels]
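
# A minimal usage sketch (not in the original). The attribute rows below are
# hypothetical, and it assumes the project-local helpers
# (`dict_list_representation`, `ut.unzip`) expect one user-attribute dict and
# one interaction-attribute row per observation.
builder = LogisticModelBuilder()
builder.set_data([{'age': 25}, {'age': 40}],   # user_atts
                 [[3, 'email'], [7, 'sms']],   # inter_atts
                 [1, 0])                       # binary responses
prob = builder.prob_f()
print(prob({'age': 25}, [3, 'email']))  # estimated P(R = 1)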
		
def randomlr(train_x, train_y, cv_x, test_x, regp, alpha=0.5):
    # Create the randomized logistic regression selector with the
    # parameters for the fit
    randomlr = RandomizedLogisticRegression(C=regp, scaling=alpha, fit_intercept=True,
                                            sample_fraction=0.75, n_resampling=200)

    # Fit the selector on the training data once, then project each split
    # onto the selected features
    train_x = randomlr.fit_transform(train_x, train_y)
    cv_x = randomlr.transform(cv_x)
    test_x = randomlr.transform(test_x)

    return train_x, cv_x, test_x
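
# A quick smoke test (not in the original); assumes a legacy scikit-learn
# (< 0.21) that still ships RandomizedLogisticRegression.
import numpy as np
from sklearn.linear_model import RandomizedLogisticRegression  # needed by randomlr above

rng = np.random.RandomState(0)
train_sel, cv_sel, test_sel = randomlr(rng.rand(80, 10),       # train_x
                                       rng.randint(0, 2, 80),  # train_y
                                       rng.rand(20, 10),       # cv_x
                                       rng.rand(20, 10),       # test_x
                                       regp=1.0)
print(train_sel.shape, cv_sel.shape, test_sel.shape)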
Example #3
def hyperparameterSearch(training_set_path, cat, rl, bu):
    print("Importing descriptors from the training set.")
    X, y, labels = import_descriptors(
        training_set_path, "*_%s_%s_train_descriptors_N20.txt" % (rl, bu))
    print("Number of features: %d." % X.shape[-1])

    print("Scaling data.")
    min_max_scaler = MinMaxScaler()
    X_scale = min_max_scaler.fit_transform(X.todense())

    print("Performing feature selection with randomized logistic regression.")
    # set n_jobs=-1 to parallelize the Randomized Logistic Regression;
    # however, a bug in the current version of scikit-learn (0.18.1) raises:
    # "ValueError: assignment destination is read-only" when n_jobs > 1
    feature_selector = RandomizedLogisticRegression(n_jobs=1)
    X_scale = feature_selector.fit_transform(X_scale, y)
    print("Reduced number of features: %d." % X_scale.shape[-1])

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the RBF kernel."
    )
    param_dist_rbf = {
        'kernel': ['rbf'],
        'C': expon(scale=2000),
        'gamma': expon(scale=.01)
    }
    random_sv_rbf = RandomizedSearchCV(SVC(),
                                       param_distributions=param_dist_rbf,
                                       n_iter=100,
                                       scoring='f1',
                                       cv=LeaveOneGroupOut(),
                                       n_jobs=-1,
                                       error_score=0,
                                       iid=False,
                                       refit=False)
    random_sv_rbf.fit(X_scale, y, groups=labels)

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the linear kernel."
    )
    param_dist_linear = {'C': expon(scale=2000)}
    random_sv_linear = RandomizedSearchCV(
        LinearSVC(),
        param_distributions=param_dist_linear,
        n_iter=100,
        scoring='f1',
        cv=LeaveOneGroupOut(),
        n_jobs=-1,
        error_score=0,
        iid=False,
        refit=False)
    random_sv_linear.fit(X_scale, y, groups=labels)

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the polynomial kernel."
    )
    param_dist_poly = {
        'kernel': ['poly'],
        'C': expon(scale=2000),
        'degree': randint(2, 11),
        'coef0': uniform(loc=-2, scale=4),
        'gamma': expon(scale=.01)
    }
    random_sv_poly = RandomizedSearchCV(SVC(),
                                        param_distributions=param_dist_poly,
                                        n_iter=100,
                                        scoring='f1',
                                        cv=LeaveOneGroupOut(),
                                        n_jobs=-1,
                                        error_score=0,
                                        iid=False,
                                        refit=False)
    random_sv_poly.fit(X_scale, y, groups=labels)

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the sigmoid kernel."
    )
    param_dist_sigmoid = {
        'kernel': ['sigmoid'],
        'C': expon(scale=2000),
        'coef0': uniform(loc=-2, scale=4),
        'gamma': expon(scale=.01)
    }
    random_sv_sigmoid = RandomizedSearchCV(
        SVC(),
        param_distributions=param_dist_sigmoid,
        n_iter=100,
        scoring='f1',
        cv=LeaveOneGroupOut(),
        n_jobs=-1,
        error_score=0,
        iid=False,
        refit=False)
    random_sv_sigmoid.fit(X_scale, y, groups=labels)

    with open(
            "%sbest_parameters_test_%s_%s_%s.txt" %
        (training_set_path, cat, rl, bu), "w") as best_params:

        extracted_features = [
            "%d" % (x + 1) for x in feature_selector.get_support(indices=True)
        ]

        print(
            "Best parameters found on training set with the RBF kernel:\n%s %s"
            % (random_sv_rbf.best_params_, random_sv_rbf.best_score_))
        best_params.write(
            "Best parameters found on training set with the RBF kernel:\n%s %s\n"
            % (random_sv_rbf.best_params_, random_sv_rbf.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, random_sv_rbf.best_params_["kernel"]))
        best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
                          (cat, rl, bu, random_sv_rbf.best_params_["kernel"]))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_rbf.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_rbf.best_params_["C"]))
        print("gamma[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_rbf.best_params_["gamma"]))
        best_params.write("gamma[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_rbf.best_params_["gamma"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_rbf.cv_results_['mean_test_score']
        stds = random_sv_rbf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_rbf.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))

        print(
            "Best parameters found on training set with the linear kernel:\n%s %s"
            % (random_sv_linear.best_params_, random_sv_linear.best_score_))
        best_params.write(
            "Best parameters found on training set with the linear kernel:\n%s %s\n"
            % (random_sv_linear.best_params_, random_sv_linear.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, 'linear'))
        best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
                          (cat, rl, bu, 'linear'))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_linear.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_linear.best_params_["C"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_linear.cv_results_['mean_test_score']
        stds = random_sv_linear.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_linear.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))

        print(
            "Best parameters found on training set with the polynomial kernel:\n%s %s"
            % (random_sv_poly.best_params_, random_sv_poly.best_score_))
        best_params.write(
            "Best parameters found on training set with the polynomial kernel:\n%s %s\n"
            % (random_sv_poly.best_params_, random_sv_poly.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, random_sv_poly.best_params_["kernel"]))
        best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["kernel"]))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_poly.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["C"]))
        print("gamma[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_poly.best_params_["gamma"]))
        best_params.write("gamma[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["gamma"]))
        print("degree[(\"%s\", \"%s\", \"%s\")] = %d" %
              (cat, rl, bu, random_sv_poly.best_params_["degree"]))
        best_params.write("degree[(\"%s\", \"%s\", \"%s\")] = %d\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["degree"]))
        print("coef0[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_poly.best_params_["coef0"]))
        best_params.write("coef0[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["coef0"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_poly.cv_results_['mean_test_score']
        stds = random_sv_poly.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_poly.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))

        print(
            "Best parameters found on training set with the sigmoid kernel:\n%s %s"
            % (random_sv_sigmoid.best_params_, random_sv_sigmoid.best_score_))
        best_params.write(
            "Best parameters found on training set with the sigmoid kernel:\n%s %s\n"
            % (random_sv_sigmoid.best_params_, random_sv_sigmoid.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["kernel"]))
        best_params.write(
            "\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
            (cat, rl, bu, random_sv_sigmoid.best_params_["kernel"]))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_sigmoid.best_params_["C"]))
        print("gamma[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["gamma"]))
        best_params.write(
            "gamma[(\"%s\", \"%s\", \"%s\")] = %f\n" %
            (cat, rl, bu, random_sv_sigmoid.best_params_["gamma"]))
        print("coef0[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["coef0"]))
        best_params.write(
            "coef0[(\"%s\", \"%s\", \"%s\")] = %f\n" %
            (cat, rl, bu, random_sv_sigmoid.best_params_["coef0"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_sigmoid.cv_results_['mean_test_score']
        stds = random_sv_sigmoid.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_sigmoid.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))
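
# Hypothetical invocation (not part of the original); the directory and the
# cat/rl/bu tags are placeholders that must match the descriptor file pattern.
hyperparameterSearch("training_set/", "cat1", "rl1", "bu1")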
Example #4
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression  #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)
ff_model = RandomizedLogisticRegression()  # Finds best set of features
X_new = ff_model.fit_transform(X, y)  # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)
print(X[0:4])
print(ff_model.transform(
    X[0:4]))  # Transform the first 4 rows of data to get only best features
model = LogisticRegression().fit(
    X_new, y)  # Fit logistic regression with best features
print(model.predict_proba(ff_model.transform(
    X[0:4])))  # predict probabilities for first 4 rows of data
print(ff_model.inverse_transform(ff_model.transform(
    X[0:4])))  # Test inverse transforming
arr = np.array([[1, 1, 1]])
print(
    ff_model.inverse_transform(arr)
)  # Get original matrix structure with 1's only in columns of retained features.
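
# A small extension (not in the original): map the selector's boolean support
# mask back onto the iris feature names to see which measurements were kept.
selected_names = np.asarray(iris.feature_names)[ff_model.get_support()]
print(selected_names)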
Example #5
def runTest(featmat_train, outcome_train_lbl, featmat_test, outcome_test_lbl,
            sel, paramsDict, bestmodelnum):
    print("Running Test for #{0} ({1})".format(TEST_PERSON_NUM,
                                               TEST_PERSON_DEVICE_ID))
    X_train_allfg = featmat_train.values
    Y_train = outcome_train_lbl.values
    #     Y_train = Y_train.reshape(Y_train.size, 1)# does this help?
    featnames_allfg = featmat_train.columns
    X_test_allfg = featmat_test.values
    Y_test = outcome_test_lbl.values
    Y_true = Y_test[0]
    sel_featnames_per_fg = {}
    sel_featnames_list_ordered = []
    sel_X_train = []
    sel_X_test = []
    countNumSel = 0
    fgi = 0
    for s in suffix_list:
        fgi = fgi + 1
        #    print fgi,
        suffix_list_str = ",".join(s)
        fgidxs = fgColIdxs[suffix_list_str]
        X_train = X_train_allfg[:, fgidxs]
        X_test = X_test_allfg[:, fgidxs]
        featnames_fg = featnames_allfg[fgidxs]
        # continue if empty
        if X_train.shape[1] == 0:
            continue
        ## scaling
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # variance thresholding
        vartransform = VarianceThreshold()
        X_train = vartransform.fit_transform(X_train)
        X_test = vartransform.transform(X_test)
        varthres_support = vartransform.get_support()
        featnames_fg = featnames_fg[varthres_support]
        ## feature selection
        if sel == "rlog":
            #print (X_train.shape)
            randomized_rlog = RandomizedLogisticRegression(**paramsDict)
            X_train = randomized_rlog.fit_transform(X_train, Y_train)
            X_test = randomized_rlog.transform(X_test)
            chosen_col_idxs = randomized_rlog.get_support()
            #print (len(featnames_fg))
            #print (len(chosen_col_idxs))

            if chosen_col_idxs.any():  # get_support() returns a full-length boolean mask
                featnames_fg_chosen = list(featnames_fg[chosen_col_idxs])
                sel_featnames_per_fg[suffix_list_str] = featnames_fg_chosen
                sel_featnames_list_ordered = sel_featnames_list_ordered + featnames_fg_chosen
                sel_X_train.append(X_train)
                sel_X_test.append(X_test)
                countNumSel = countNumSel + len(featnames_fg_chosen)
        else:
            raise ("Unrecognized sel (feature selection algorithm)")
    ## feature selection:  sel{sel{fg1}.....sel{fg45}}
    X_train_concat = np.hstack(sel_X_train)
    X_test_concat = np.hstack(sel_X_test)
    print("\nSum of number of features selected from all fgs = {0}".format(
        countNumSel))
    print("Concatenated X_train has {0} features".format(
        X_train_concat.shape[1]))
    print("Concatenated X_test has {0} features".format(
        X_test_concat.shape[1]))
    if sel == "rlog":
        randomized_rlog = RandomizedLogisticRegression(**paramsDict)
        X_train_concat = randomized_rlog.fit_transform(X_train_concat, Y_train)
        X_test_concat = randomized_rlog.transform(X_test_concat)
        chosen_col_idxs = randomized_rlog.get_support()
        sel_featnames_list_ordered = np.array(sel_featnames_list_ordered)
        chosen_col_idxs = np.array(chosen_col_idxs)
        chosen_cols_final = sel_featnames_list_ordered[chosen_col_idxs]
    else:
        raise ("Unrecognized sel (feature selection algorithm)")
    print("Final number of features in model = {0}".format(
        X_train_concat.shape[1]))
    # GBCT
    if modelname == "GBC":
        clf = GradientBoostingClassifier(random_state=0)
    elif modelname == "LOGR":
        clf = LogisticRegression(random_state=0,
                                 C=paramsDict["C"],
                                 tol=1e-3,
                                 penalty="l1",
                                 n_jobs=paramsDict["n_jobs"],
                                 intercept_scaling=1,
                                 class_weight="balanced")
    else:
        raise ("Unrecognized model name")
    clf.fit(X_train_concat, Y_train)
    pred = clf.predict(X_test_concat)
    pred_proba = clf.predict_proba(X_test_concat)
    Y_pred = pred[0]
    Y_pred_proba = pred_proba[0][1]
    ## Logging test_person_test.csv - outputs 1 line only
    ## did, sel, selParams, Y_pred, Y_pred_proba, Y_true, chosen_cols_final, suffix_list_str : sel_featnames_per_fg[suffix_list_str] in separate columns
    chosen_cols_final_str = ",".join(chosen_cols_final)
    paramsDict_str = ','.join("%s:%r" % (key, val)
                              for (key, val) in paramsDict.items())
    fgIdxs_str = ','.join("%s:%r" % (key, val)
                          for (key, val) in fgIdxs.items())
    cnts_per_lbl_dict = getValueCounts(outcome_train_lbl, outcome_test_lbl)
    cnts_per_lbl_str = ','.join("%s:%r" % (key, val)
                                for (key, val) in cnts_per_lbl_dict.items())
    dfout = pd.DataFrame({
        "did": [TEST_PERSON_DEVICE_ID],
        "cnts_per_lbl": [cnts_per_lbl_str],
        "sel": [sel],
        "selParams": [paramsDict_str],
        "Y_pred": [Y_pred],
        "Y_pred_proba": [Y_pred_proba],
        "Y_true": [Y_true],
        "fgIdxs": [fgIdxs_str],
        "sel_final": [chosen_cols_final_str]
    })
    dfout = dfout.set_index("did")
    cols = [
        "cnts_per_lbl", "sel", "selParams", "Y_pred", "Y_pred_proba", "Y_true",
        "fgIdxs", "sel_final"
    ]
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        if suffix_list_str in sel_featnames_per_fg:
            sel_feats_fg_str = ",".join(sel_featnames_per_fg[suffix_list_str])
        else:
            sel_feats_fg_str = ""
        dfcol = pd.DataFrame({
            "did": [TEST_PERSON_DEVICE_ID],
            "sel_{0}".format(suffix_list_str): [sel_feats_fg_str]
        })
        dfcol = dfcol.set_index("did")
        dfout = pd.concat([dfout, dfcol], axis=1)
        cols.append("sel_{0}".format(suffix_list_str))
    dfout.to_csv(
        folderpath +
        "{0}_test_model{1}.csv".format(TEST_PERSON_DEVICE_ID, bestmodelnum),
        columns=cols,
        header=True)
    print("{0} minutes elapsed since start of program ".format(
        (time.time() - STARTTIME) / 60.0))
    return (Y_pred, Y_pred_proba)
Example #6
    # 'Normalize/Scale features if needed. Our data is standardized by default'
    # X = StandardScaler(copy=False).fit_transform(X)

    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    featureNames = featureNames[Fwe.get_support()]
    print("F-test filter ->", X.shape)

    FeatSelection_SVM = True
    FeatSelection_RandLogReg = False

    if FeatSelection_RandLogReg == True:
        LogRegFeats = RandomizedLogisticRegression(C=5,
                                                   scaling=0.5,
                                                   sample_fraction=0.8,
                                                   n_resampling=60,
                                                   selection_threshold=0.2,
                                                   n_jobs=-1)
        X = LogRegFeats.fit_transform(X, y)
        featureNames = featureNames[LogRegFeats.get_support()]
        print("RandomizedLogisticRegression Feature Selection ->:", X.shape)

    elif FeatSelection_SVM == True:
        # SelectFromModel (sklearn.feature_selection) exposes the selected
        # feature mask for the sparse linear SVM.
        svm_selector = SelectFromModel(
            LinearSVC(C=1, penalty="l1", dual=False, class_weight='balanced'))
        X = svm_selector.fit_transform(X, y)
        # X = LogisticRegression(C=0.01, class_weight='balanced').fit_transform(X, y)
        featureNames = featureNames[svm_selector.get_support()]
        print("SVC Transformed X:", X.shape)
    '''
    print("Plot #Feats vs Classification performance:")
    PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
    '''

    KFilt = None
# Select the features whose standard deviation exceeds 0.5
large_std_features_index = [
    i for i in range(len(features_std)) if features_std[i] > 0.5
]

X2 = X[:, large_std_features_index]

# Step 2: select variables with a lasso-constrained logistic regression model
# First, find the best value of the parameter C on a validation split
auc_list = []
for Ci in list(range(1, 101)):
    X21, X22, y21, y22 = model_selection.train_test_split(X2, y, test_size=0.2)

    lr = RandomizedLogisticRegression(C=Ci)  # model parameters can be tuned here
    lr.fit(X21, y21)  # train the model on X, y; the data must not contain missing values
    X_new = lr.inverse_transform(lr.transform(X21))  # back to the original column space
    # find the columns of X_new that are not entirely zero
    zero_columns = np.sum(np.abs(X_new), axis=0)
    nonzero_columns_index = [
        i for i in range(len(zero_columns)) if zero_columns[i] > 0.0001
    ]
    X3 = X21[:, nonzero_columns_index]
    lr_best = LogisticRegression()
    lr_best.fit(X3, y21)  # fit on the selected columns only
    prob_predict = lr_best.predict_proba(X22[:, nonzero_columns_index])[:, 1]
    auc = metrics.roc_auc_score(y22, prob_predict)  # takes labels and scores, not curve points
    auc_list.append(auc)

best_C_position = auc_list.index(max(auc_list))
best_C = list(range(1, 101))[best_C_position]
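
# A hedged follow-up sketch (not in the original): with best_C chosen, refit
# the selector and the final logistic regression on the full data X2, y.
lr_final = RandomizedLogisticRegression(C=best_C)
X_final = lr_final.fit_transform(X2, y)
final_model = LogisticRegression().fit(X_final, y)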
    print "classifier:", cv.std(), cv.mean()
    print "majority base:", accuracy_score(labEnc.transform(labels),
                                           labEnc.transform(maj))
    print "random base:", accuracy_score(labEnc.transform(labels),
                                         labEnc.transform(rand))

if args.coef:
    # Output
    file_basename = args.output

    sel = RandomizedLogisticRegression(n_jobs=10,
                                       n_resampling=args.iterations,
                                       sample_fraction=0.75,
                                       verbose=2)
    new_X = sel.fit_transform(X, enclabels)

    clf = LogisticRegression(class_weight='balanced')
    clf.fit(new_X, enclabels)

    # this one does not get the probs
    # selected_feature_names = np.asarray(vectorizer.get_feature_names())[np.flatnonzero(clf.coef_[0])]
    # selected_feature_probs = clf.coef_[0][np.flatnonzero(clf.coef_[0])]

    # this one gets probs, but introduces a mismatch
    # selected_feature_names = np.asarray(vectorizer.get_feature_names())[np.flatnonzero(sel.scores_)]
    # selected_feature_probs = sel.scores_[np.flatnonzero(sel.scores_)]

    # this one works, it seems
    active_feature_mask = sel.get_support()
    selected_feature_names = np.asarray(
        vectorizer.get_feature_names())[active_feature_mask]
    selected_feature_probs = sel.scores_[active_feature_mask]