def ridge_dummy_regression(X, y, x_test, lambda_val=None):
    """
    Train an L2-regularized (ridge) logistic regression on X, y, then predict on x_test.
    If lambda_val is provided, it is used directly as the inverse-regularization
    parameter C; otherwise 5-fold CV is run over C = np.logspace(-1.5, 1.5, 5).

    Returns the predicted probabilities of the positive class.
    """
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

    Cs = np.logspace(-1.5, 1.5, 5)
    lr = LogisticRegression(penalty='l2')
    cv_list = list()

    if not lambda_val:
        # Fit ridge for various choices of regularization parameter C to select the best C
        for c in Cs:
            lr.C = c
            cv_score = cross_val_score(lr, X, y, scoring='roc_auc', cv=5)
            cv_list.append(np.mean(cv_score))

        print('Best lambda based on Ridge Cross-Validation...')
        max_score = np.max(cv_list)
        lambda_val = Cs[cv_list.index(max_score)]
        print(1.0 / lambda_val, max_score)

    # Train LR with the optimized regularization parameter
    lr.C = lambda_val
    lr.fit(X, y)
    proba_lst = lr.predict_proba(x_test)[:, 1]

    return proba_lst
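# Note: a shorter modern equivalent of the CV loop above, using sklearn's
# LogisticRegressionCV (the name ridge_dummy_regression_cv is hypothetical);
# it performs the same per-C scoring internally.
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

def ridge_dummy_regression_cv(X, y, x_test):
    lr_cv = LogisticRegressionCV(Cs=np.logspace(-1.5, 1.5, 5), cv=5,
                                 scoring='roc_auc', penalty='l2')
    lr_cv.fit(X, y)
    return lr_cv.predict_proba(x_test)[:, 1]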
Example #2
def logistic_regression(x_train, y_train, x_test, penalty='L2', regularization=1.0, do_CV=False):
    from sklearn.linear_model import LogisticRegression
    # Assumes module-level: import numpy as np, import csv, and a udf module
    # providing cross_val_score_proba.

    ### Mean-normalize variables before regression ###
    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)  # reuse the training-set scaling; refitting on the test set leaks

    if penalty == 'L1':
        lr = LogisticRegression(penalty='l1', solver='liblinear')
        filename = "Lasso_submission.csv"
    else:
        lr = LogisticRegression(penalty='l2')
        filename = "Ridge_submission.csv"

    if do_CV:
        # CV branch only reports the best regularization parameter
        Cs = np.logspace(-1.5, 1.5, 10)
        cv_list = list()

        ### Fit the model for various choices of regularization parameter C to select the optimal C
        for c in Cs:
            lr.C = c
            print('Running K-fold CV with lambda = %.5f' % (1.0 / c))
            cv_scores = udf.cross_val_score_proba(x_train, y_train, 5, lr)
            cv_list.append(np.mean(cv_scores))

        print('Best lambda based on Cross-Validation...')
        max_score = np.max(cv_list)
        max_lambda = Cs[cv_list.index(max_score)]
        print(1.0 / max_lambda, max_score)
    else:
        print('Making prediction with optimal lambda....')
        lr.C = 1.0 / regularization  # sklearn's C is the inverse of lambda
        lr.fit(x_train, y_train)
        y_pred = lr.predict_proba(x_test)[:, 1]

        print('Coefficients of the regression:')
        print(lr.coef_)

        print('Writing submission file....')
        with open(filename, 'w', newline='') as testfile:
            w = csv.writer(testfile)
            w.writerow(('Id', 'Probability'))
            for i in range(len(y_pred)):
                w.writerow((i + 1, y_pred[i]))
        print('File written to disk...')
Example #3
def logisticcv1():
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix
    from sklearn import datasets
    import numpy as np
    from collections import Counter

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    print(X.shape)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    print(X_train.shape, y_train.shape)
    #alphas = np.logspace(-2, -0.5, 30)
    alphas = np.arange(0.4, 0.55, 0.001)
    #logisticcv = LogisticRegressionCV(cv = 10, penalty = 'l1', solver = 'liblinear')
    logistic2 = LogisticRegression(penalty='l1', solver='liblinear')
    maxscore = 0
    maxa = 0
    # Sweep C over the alpha grid, scoring each value with 10-fold CV
    for a in alphas:
        logistic2.C = a
        scores2 = cross_val_score(logistic2, X_train, y_train, cv=10)
        mean1 = scores2.mean()
        print("alpha and score", a, mean1)
        if mean1 > maxscore:
            maxscore = mean1
            maxa = a
    print(maxa, maxscore)

    logistic2.C = 0.5
    c1 = Counter(y_test)
    print(c1)
    logistic2.fit(X_train, y_train)
    pred_values = logistic2.predict(X_test)
    cmatrix = confusion_matrix(y_test, pred_values)
    print("below is the confusion matrix on the test data set", cmatrix, sep="\n")
    print("coefficients  ", logistic2.coef_)
    print("penalty function", logistic2.penalty, sep="  ")
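Example #4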
def optimize_Logreg_model(data,
                          selected_features,
                          label_name,
                          C_array,
                          train_years=[2000, 2012],
                          val_years=[2012, 2016]):

    reduced_data = reduce_data(data, label_name)

    X = reduced_data[selected_features]

    y = reduced_data[label_name]

    train_ind = np.array(
        reduced_data[(reduced_data.year >= train_years[0])
                     & ((reduced_data.year < train_years[1]))].index)
    val_ind = np.array(
        reduced_data[(reduced_data.year >= val_years[0])
                     & ((reduced_data.year < val_years[1]))].index)

    y_train = reduced_data[(reduced_data.year >= train_years[0]) & (
        (reduced_data.year < train_years[1]))][label_name]

    n_pos = len(y_train[y_train == True])
    n_neg = len(y_train[y_train == False])

    W_neg = (1.0 / n_neg)
    W_pos = (1.0 / n_pos)

    Weights = {True: W_pos, False: W_neg}

    opt_model = LogisticRegression(C=1.0,
                                   class_weight=Weights,
                                   penalty='l1',
                                   fit_intercept=True,
                                   solver='liblinear',
                                   random_state=0)

    param_grid = {'C': C_array}

    scoring = sklm.make_scorer(positive_fscore)

    cv = ((train_ind, val_ind), )

    GS = ms.GridSearchCV(estimator=opt_model,
                         param_grid=param_grid,
                         cv=cv,
                         scoring=scoring,
                         return_train_score=False,
                         n_jobs=4)
    GS.fit(X, y)

    best_param = GS.best_params_['C']

    opt_model.C = best_param

    test_scores = GS.cv_results_['mean_test_score']

    return opt_model, test_scores
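# optimize_Logreg_model assumes helpers not shown in this snippet: reduce_data,
# positive_fscore, and the aliases sklm (sklearn.metrics) and
# ms (sklearn.model_selection). A plausible sketch of positive_fscore,
# assuming it means the F1 score of the positive (True) class:
from sklearn.metrics import f1_score

def positive_fscore(y_true, y_pred):
    # hypothetical reconstruction: F1 restricted to the True label
    return f1_score(y_true, y_pred, pos_label=True)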
Example #5
def logistic_(li_X, li_y):
    X = li_X
    y = li_y

    # Logistic Regression
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    clf.fit(X, y)
    print(clf.coef_, clf.intercept_)
    print('training accuracy:', clf.score(X, y))
    y_pred = clf.predict(X)

    from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

    # Metrics on the training set itself (optimistic; the CV scores below are fairer)
    print(recall_score(y, y_pred), precision_score(y, y_pred),
          f1_score(y, y_pred), accuracy_score(y, y_pred))
    
    from sklearn.model_selection import StratifiedKFold
    k = 5
    skfold = StratifiedKFold(n_splits = k)
    
    accs = []
    recall = []
    precision = []
    f1s = []

    X = X.values
    y = y.values
   
    
    for train_set, valid_set in skfold.split(X,y):
        clf.C = 1
        clf.fit(X[train_set], y[train_set])
        y_pred = clf.predict(X[valid_set])
        acc = accuracy_score(y[valid_set],y_pred)
        r = recall_score(y[valid_set], y_pred)
        p = precision_score(y[valid_set], y_pred)
        f1 = f1_score(y[valid_set], y_pred)
        accs.append(acc)
        recall.append(r)
        precision.append(p)
        f1s.append(f1)
    a = sum(accs)/len(accs)
    b = sum(recall)/len(recall)
    c = sum(precision)/len(precision)
    d = sum(f1s)/len(f1s)
    
    from sklearn.metrics import roc_auc_score, roc_curve
    import matplotlib.pyplot as plt

    y_prob = clf.predict_proba(X)
    fpr, tpr, thres = roc_curve(y, y_prob[:, 1], pos_label=1)
    
    return a,b,c,d,plt.plot(fpr, tpr),roc_auc_score(y, y_prob[:,1])
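Example #6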
def penalty_l2(X, y, l):
    # sklearn's C is the inverse of the regularization strength lambda
    clf = LogisticRegression(penalty='l2', C=1.0 / l)
    #scores = cross_val_score(clf, X, y, cv=5)
    #print("cross validation scores of logistic regression model with l2 = {} are:".format(l), scores)
    #print("mean of cross validation scores of logistic regression with l2 = {} model is:".format(l), np.mean(scores))
    clf.fit(X, y)
    return clf
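# Usage sketch for penalty_l2 on synthetic data (make_classification is just a
# stand-in): larger l means stronger shrinkage, since C is the inverse of l.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
clf_demo = penalty_l2(X_demo, y_demo, l=10.0)  # equivalent to C = 0.1
print(clf_demo.coef_)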
Example #7
def fit_logistic(X, y, min_c=1e-10, max_c=1e4, num_c=50, num_folds=5, X_holdout=None, model='gboost', min_prob=5e-3):
    class Dummy:
        def __init__(self, const):
            self.const = const

        def predict_proba(self, X):
            p = np.ones(len(X)) * self.const
            return np.array([1-p, p]).T
    if y.sum() == 0:
        if X_holdout is None:
            return np.zeros(len(y)) + min_prob, lambda: np.zeros(len(y), dtype=int) + min_prob, Dummy(min_prob)
        else:
            return np.zeros(X_holdout.shape[0]) + min_prob, lambda: np.zeros(X_holdout.shape[0], dtype=int) + min_prob, Dummy(min_prob)
    if y.sum() == len(y):
        if X_holdout is None:
            return np.ones(len(y)) - min_prob, lambda: np.ones(len(y), dtype=int) - min_prob, Dummy(1-min_prob)
        else:
            return np.ones(X_holdout.shape[0]) - min_prob, lambda: np.ones(X_holdout.shape[0], dtype=int) - min_prob, Dummy(1-min_prob)
    if model == 'lasso':
        from sklearn.linear_model import LogisticRegression
        # Use cross-validation to select lambda
        c_vals = np.exp(np.linspace(np.log(min_c), np.log(max_c), num_c))
        cv_scores = np.zeros(num_c)
        folds = create_folds(X, num_folds)
        for i,fold in enumerate(folds):
            # print '\tFold #{0}'.format(i)
            mask = np.ones(len(X), dtype=bool)
            mask[fold] = False
            X_train, y_train = X[mask], y[mask]
            X_test, y_test = X[~mask], y[~mask]
            if y_train.sum() == 0:
                cv_scores += (1-y_test).sum()
            elif y_train.sum() == len(y_train):
                cv_scores += (y_test).sum()
            else:
                lr = LogisticRegression(penalty='l1', solver='liblinear', C=min_c, warm_start=True)
                for j,c in enumerate(c_vals):
                    lr.C = c
                    lr.fit(X_train, y_train)
                    # held-out log-likelihood: take each sample's log-probability
                    # of its true class (the original [:, y_test] indexing summed
                    # an (n_test, n_test) matrix instead)
                    cv_scores[j] += lr.predict_log_proba(X_test)[np.arange(len(y_test)), y_test.astype(int)].sum()
        cv_scores /= float(len(X))
        best_idx = np.argmax(cv_scores)
        best_c = c_vals[best_idx]
        lr = LogisticRegression(penalty='l1', solver='liblinear', C=best_c)
    elif model == 'gboost':
        from sklearn.ensemble import GradientBoostingClassifier
        lr = GradientBoostingClassifier(subsample=0.5)

    lr.fit(X, y)
    if X_holdout is None:
        probs = lr.predict_proba(X)[:,1]
    else:
        probs = lr.predict_proba(X_holdout)[:,1]

    return probs, lambda: (np.random.random(size=len(probs)) <= probs).astype(int), lr
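# fit_logistic relies on a create_folds helper that is not shown in this
# snippet. A minimal sketch, assuming it returns one index array per fold:
import numpy as np

def create_folds(X, num_folds):
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    return np.array_split(indices, num_folds)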
Example #8
def ridge_dummy_regression(X, y, testData, lambda_val=None):
    """
    Train an L2 (ridge) logistic regression on X, y, then predict on testData.
    If lambda_val is provided, it is used directly as C; otherwise each C in
    np.logspace(-1.5, 1.5, 5) is scored on a single fixed 80/20 split
    (unlike Example #1, which uses 5-fold CV).

    Returns the predicted probabilities of the positive class.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
    from sklearn.metrics import roc_auc_score

    Cs = np.logspace(-1.5, 1.5, 5)
    lr = LogisticRegression(penalty='l2')
    cv_list = list()

    if not lambda_val:
        ### randomly divide data into a fixed 80/20 split ###
        ### (a single split is used because the response is very sparse) ###
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42)

        # Fit ridge for various choices of regularization parameter C to select the best C
        for c in Cs:
            lr.C = c
            lr.fit(X_train, y_train)
            y_pred = lr.predict_proba(X_test)[:, 1]
            cv_list.append(roc_auc_score(y_test, y_pred))

        print('Best lambda based on Ridge Cross-Validation...')
        max_score = np.max(cv_list)
        lambda_val = Cs[cv_list.index(max_score)]
        print(1.0 / lambda_val, max_score)

    # Train LR with the optimized regularization parameter
    lr.C = lambda_val
    lr.fit(X, y)
    proba_lst = lr.predict_proba(testData)[:, 1]

    return proba_lst
Example #9
def fit_Logreg_model(data,
                     selected_features,
                     label_name,
                     C_array,
                     n_splits=2,
                     shuffle=True,
                     shuffle_seed=10):
    reduced_data = reduce_data(data, label_name)

    X = reduced_data[selected_features]

    y = reduced_data[label_name]

    n_pos = len(y[y == True])
    n_neg = len(y[y == False])

    W_neg = (1.0 / n_neg) / (1.0 / n_pos + 1.0 / n_neg)
    W_pos = (1.0 / n_pos) / (1.0 / n_pos + 1.0 / n_neg)

    Weights = {True: W_pos, False: W_neg}

    opt_model = LogisticRegression(C=1.0,
                                   class_weight=Weights,
                                   penalty='l1',
                                   fit_intercept=True,
                                   solver='liblinear',
                                   random_state=0)

    param_grid = {'C': C_array}

    scoring = sklm.make_scorer(weighted_fscore)

    cv = ms.KFold(n_splits=n_splits,
                  shuffle=shuffle,
                  random_state=shuffle_seed)

    GS = ms.GridSearchCV(estimator=opt_model,
                         param_grid=param_grid,
                         cv=cv,
                         scoring=scoring,
                         return_train_score=False,
                         n_jobs=4)  # iid= was deprecated in sklearn 0.22 and later removed
    GS.fit(X, y)

    best_param = GS.best_params_['C']

    opt_model.C = best_param

    mean_test_scores = GS.cv_results_['mean_test_score']

    std_test_scores = GS.cv_results_['std_test_score']

    return X, y, opt_model, mean_test_scores, std_test_scores
Example #11
def train_logistic(X, y):
    from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in 0.20
    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression()

    Crange = np.logspace(-8, 1, 8)
    grid = GridSearchCV(clf, param_grid={'C': Crange}, scoring='roc_auc', cv=5)
    grid.fit(X, y)

    # grid.best_estimator_ is already refit on the full data;
    # refitting with the best C below is equivalent.
    clf.C = grid.best_params_['C']
    clf.fit(X, y)
    return clf
Example #12
    def update(self):
        # Handle to storage for model parameters
        params = self._parameters

        print("Starting to update Logistic Regression!")

        # Make sure all my meta data is ready to go
        params.validateMeta()

        observation_vectors = []
        truth_vectors = []

        # Make sure my model data is ready to go
        self._model_data.validate()
        self._model_data.validateViews(self.getMetaData("db_views"))

        # Check my model data
        observation_vectors = self._model_data.getMetaData(
            "observation_vectors")

        truth_vectors = self._model_data.getMetaData("truth_vectors")

        params.setMetaData("db_views", [])

        # Houston we are go
        lr = LogisticRegression()

        lr.penalty = params.getMetaData("penalty")
        lr.dual = params.getMetaData("dual")
        lr.C = params.getMetaData("C")
        lr.fit_intercept = params.getMetaData("fit_intercept")
        lr.intercept_scaling = params.getMetaData("intercept_scaling")
        class_weight = params.getMetaData("class_weight")
        if class_weight is not None:
            lr.class_weight = class_weight
        lr.max_iter = params.getMetaData("max_iter")
        lr.random_state = params.getMetaData("random_state")
        lr.solver = params.getMetaData("solver")
        tol = params.getMetaData("tol")
        if tol is not None:
            lr.tol = tol
        lr.multi_class = params.getMetaData("multi_class")
        lr.verbose = params.getMetaData("verbose")

        # Evaluation mode loads several model artifacts from storage and sets them as inputs
        lr.fit(observation_vectors, truth_vectors)
        params.setBinaryData("lr_model", "application/pickle",
                             pickle.dumps(lr))

        self.finalize()
Example #13
def build_model_rbm():
    # Assumes numpy imported as np at module level
    from sklearn.neural_network import BernoulliRBM
    from sklearn.linear_model import LogisticRegression

    np.random.seed(12)
    rbm_estimators = list()
    # rbm = BernoulliRBM(random_state=12, verbose=0, n_components=in_dim)
    rbm = BernoulliRBM(random_state=np.random.randint(1, 100), verbose=0)
    lr = LogisticRegression()

    rbm.learning_rate = 0.0001
    # rbm.n_iter = 20
    # rbm.n_components = 50

    lr.C = 10.0

    rbm_estimators.append(('rbm', rbm))
    rbm_estimators.append(('lr', lr))

    return rbm_estimators
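# The (name, estimator) pairs above are shaped for sklearn's Pipeline.
# A minimal usage sketch, assuming features X in [0, 1] (BernoulliRBM expects
# binary/[0, 1] inputs) and labels y:
from sklearn.pipeline import Pipeline

rbm_lr = Pipeline(steps=build_model_rbm())
# rbm_lr.fit(X, y)
# rbm_lr.predict(X)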
Example #15
def HyperSearch():
    # Courtesy of Miroslaw Horbal
    base = [127, 96, 53, 3, 103, 71, 151, 1, 65, 152]
    f = fileio.Preprocessed('../data/quads10Threshold.csv')
    f.encode(base)
    train, truth = f.transformTrain(base)
    print("Performing hyperparameter selection...")

    clf = LogisticRegression(C=2.3, class_weight='balanced')  # 'auto' was renamed 'balanced'
    # Hyperparameter selection loop
    score_hist = []
    Cvals = np.linspace(1, 4, 32)
    eval_ = classifier.Classifier(train, truth)
    for C in Cvals:
        clf.C = C
        score = eval_.holdout(clf, nFolds=10, fraction=0.2)
        score_hist.append((score, C))
        print("C: %f Mean AUC: %f" % (C, score))
    bestC = sorted(score_hist)[-1][1]
    print("Best C value: %f" % bestC)
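Example #17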
# Fragment: the setup below is a hypothetical reconstruction of the missing
# context (X and y are assumed to be defined earlier).
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import (StratifiedKFold, cross_val_score,
                                     permutation_test_score)
from sklearn.metrics import roc_auc_score

cv = StratifiedKFold(5)
scores, coefs, Cs = [], [], []
LRs = []

for train, test in cv.split(X, y):
    # clf = LogisticRegression(C=1)
    clf = LogisticRegressionCV()
    clf.fit(X[train], y[train])
    y_pred = clf.predict(X[test])

    scores.append(roc_auc_score(y[test], y_pred))
    coefs.append(clf.coef_)
    Cs.append(clf.C_)
    LRs.append(clf)

# Build an "average" model from the per-fold fits. Note that cross_val_score
# and permutation_test_score clone and refit the estimator, so the averaged
# coefficients below only affect direct predict/decision_function calls.
lr_mean = LogisticRegression()
lr_mean.coef_ = np.asarray(coefs).mean(axis=0)
lr_mean.C = np.asarray(Cs).mean()
lr_mean.intercept_ = np.asarray([est.intercept_ for est in LRs]).mean()

lr_coef_mean = np.asarray(coefs).mean(axis=0)
lr_coef_std = np.asarray(coefs).std(axis=0)

cv_scores = cross_val_score(lr_mean,
                            X,
                            y,
                            scoring="roc_auc",
                            cv=StratifiedKFold(9))

# (the call below was truncated in the original listing; closed here with its
# required arguments only)
score_full_X, perm_scores_full_X, pvalue_full_X = permutation_test_score(
    lr_mean, X, y)
Example #18
lcr  = LogisticRegression()
knn  = KNeighborsClassifier(n_neighbors=10)
rrc  = RidgeClassifierCV(normalize=True)
ada  = AdaBoostClassifier()
ada_dct = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, random_state=np.random.RandomState(1))  # a classifier base estimator needs AdaBoostClassifier, not AdaBoostRegressor
lda  = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
qda  = QuadraticDiscriminantAnalysis()
rfc  = RandomForestClassifier()                         # Random Forests are gooooood!!
gb   = GradientBoostingClassifier(n_estimators=1000) 
dtr  = DecisionTreeClassifier()
rbm = BernoulliRBM(n_components=2)
logistic = LogisticRegression()
rbm.learning_rate = 0.06
rbm.n_iter = 20
rbm.n_components = 100
logistic.C = 1  
rbm_lcr = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
# --------------------------------------------------------------------- #
# ----------------------------- HMM ----------------------------------- #
# --------------------------------------------------------------------- #
'''
markov = hmm.GaussianHMM(n_components=3, n_iter=500, init_params="mcst", covariance_type="full")
'''
markov = hmm.GaussianHMM(n_components=3, n_iter=500, params="mcs", init_params="mcs", covariance_type="full")
markov.transmat_ = np.array([[ 0.95354708,  0.04633496,  0.00011796],
                             [ 0.04959727,  0.93909542,  0.01130731],
                             [ 0.05827543,  0.00015793,  0.94156665]])

# We should try to learn parameters of MARKOV TRANSITION MATRIX !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# --------------------------------------------------------------------- #
# ----------------------------- GMM ----------------------------------- #
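Example #19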
    def fit(self, X, y, Xstd=None):
        """Fit model to training data.

        Args:
            X (DataFrame): Binarized features with MultiIndex column labels
            y (array): Target variable
            Xstd (DataFrame, optional): Standardized numerical features
        Returns:
            LogisticRuleRegression: Self
        """
        # Initialization
        # Number of samples
        n = X.shape[0]
        if self.init0:
            # Initialize with empty feature indicator and conjunction matrices
            z = pd.DataFrame([], index=X.columns)
            A = np.empty((X.shape[0], 0))
        else:
            # Initialize with X itself i.e. singleton conjunctions
            # Feature indicator and conjunction matrices
            z = pd.DataFrame(np.eye(X.shape[1], dtype=int), index=X.columns)
            # Remove negations
            indPos = X.columns.get_level_values(1).isin(['', '<=', '=='])
            z = z.loc[:,indPos]
            A = X.loc[:,indPos].values
            # Scale conjunction matrix to account for non-uniform penalties
            A = A * self.lambda0 / (self.lambda0 + self.lambda1 * z.sum().values)
        if self.useOrd:
            self.namesOrd = Xstd.columns
            numOrd = Xstd.shape[1]
            # Scale ordinal features to have similar std as "average" binary feature
            Astd = 0.4 * Xstd.values
        # Iteration counter
        self.it = 0
        # Logistic regression object
        lr = LogisticRegression(
            penalty='l1',
            C=1/(n*self.lambda0),
            solver='saga',
            multi_class='ovr',
            max_iter=self.maxSolverIter)

        self.p = y.mean()
        if self.init0:
            # Initial residual
            r = (self.p - y) / n
            # Derivative w.r.t. intercept term
            UB = min(r.sum(), 0)
        else:
            # Fit logistic regression model
            if self.useOrd:
                B = np.concatenate((Astd, A), axis=1)
                lr.fit(B, y)
                # Initial residual
                r = (lr.predict_proba(B)[:,1] - y) / n
            else:
                lr.fit(A, y)
                # Initial residual
                r = (lr.predict_proba(A)[:,1] - y) / n
            # Most "negative" subderivative among current variables (undo scaling)
            UB = -np.abs(np.dot(r, A))
            UB *= (self.lambda0 + self.lambda1 * z.sum().values) / self.lambda0
            UB += self.lambda0 + self.lambda1 * z.sum().values
            UB = min(UB.min(), 0)

        # Beam search for conjunctions with subdifferentials that exclude zero
        vp, zp, Ap = beam_search_K1(r, X, self.lambda0, self.lambda1,
            UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
        vn, zn, An = beam_search_K1(-r, X, self.lambda0, self.lambda1,
            UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
        v = np.append(vp, vn)

        while (v < UB).any() and (self.it < self.iterMax):
            # Subdifferentials excluding zero exist, continue
            self.it += 1
            zNew = pd.concat([zp, zn], axis=1, ignore_index=True)
            Anew = np.concatenate((Ap, An), axis=1)

            # K conjunctions with largest subderivatives in absolute value
            idxLargest = np.argsort(v)[:self.K]
            v = v[idxLargest]
            zNew = zNew.iloc[:,idxLargest]
            Anew = Anew[:,idxLargest]
            # Scale new conjunction matrix to account for non-uniform penalties
            Anew = Anew * self.lambda0 / (self.lambda0 + self.lambda1 * zNew.sum().values)

            # Add to existing conjunctions
            z = pd.concat([z, zNew], axis=1, ignore_index=True)
            A = np.concatenate((A, Anew), axis=1)
            # Fit logistic regression model
            if self.useOrd:
                B = np.concatenate((Astd, A), axis=1)
                lr.fit(B, y)
                # Residual
                r = (lr.predict_proba(B)[:,1] - y) / n
            else:
                lr.fit(A, y)
                # Residual
                r = (lr.predict_proba(A)[:,1] - y) / n

            # Most "negative" subderivative among current variables (undo scaling)
            UB = -np.abs(np.dot(r, A))
            UB *= (self.lambda0 + self.lambda1 * z.sum().values) / self.lambda0
            UB += self.lambda0 + self.lambda1 * z.sum().values
            UB = min(UB.min(), 0)

            # Beam search for conjunctions with subdifferentials that exclude zero
            vp, zp, Ap = beam_search_K1(r, X, self.lambda0, self.lambda1,
                UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
            vn, zn, An = beam_search_K1(-r, X, self.lambda0, self.lambda1,
                UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
            v = np.append(vp, vn)

        # Restrict model to conjunctions with nonzero coefficients
        try:
            idxNonzero = np.where(np.abs(lr.coef_) > self.eps)[1]
            if self.useOrd:
                # Nonzero indices of standardized and rule features
                self.idxNonzeroOrd = idxNonzero[idxNonzero < numOrd]
                nnzOrd = len(self.idxNonzeroOrd)
                idxNonzeroRules = idxNonzero[idxNonzero >= numOrd] - numOrd
                if self.debias and len(idxNonzero):
                    # Re-fit logistic regression model with effectively no regularization
                    z = z.iloc[:,idxNonzeroRules]
                    lr.C = 1 / self.eps
                    lr.fit(B[:,idxNonzero], y)
                    idxNonzero = np.where(np.abs(lr.coef_) > self.eps)[1]
                    # Nonzero indices of standardized and rule features
                    idxNonzeroOrd2 = idxNonzero[idxNonzero < nnzOrd]
                    self.idxNonzeroOrd = self.idxNonzeroOrd[idxNonzeroOrd2]
                    idxNonzeroRules = idxNonzero[idxNonzero >= nnzOrd] - nnzOrd
                self.z = z.iloc[:,idxNonzeroRules]
                lr.coef_ = lr.coef_[:,idxNonzero]
            else:
                if self.debias and len(idxNonzero):
                    # Re-fit logistic regression model with effectively no regularization
                    z = z.iloc[:,idxNonzero]
                    lr.C = 1 / self.eps
                    lr.fit(A[:,idxNonzero], y)
                    idxNonzero = np.where(np.abs(lr.coef_) > self.eps)[1]
                self.z = z.iloc[:,idxNonzero]
                lr.coef_ = lr.coef_[:,idxNonzero]
        except AttributeError:
            # Model has no coefficients except intercept
            self.z = z
        self.lr = lr
Example #20
# Fragment: assumes log_reg (a LogisticRegression), data (a dict of datasets),
# solvers, C_values, cv_folds (a KFold), int_train_data, y_train and num_folds
# are defined earlier in the script.
scores = np.zeros((1, len(solvers), len(C_values)), dtype=float)  # np.float was removed in NumPy 1.24

log_reg.penalty = 'l2'
log_reg.max_iter = 6000

start_time = time.time()
for i, dataset in enumerate(data):
    for j, solver in enumerate(solvers):
        for l, c in enumerate(C_values):
            # Since each dataset has the same order of data, just with different preprocessing, we can use the same k-folds for
            # each variant of the data (same applies for labels)
            folds = cv_folds.split(int_train_data, y_train)

            # Changing hyperparameter for this run
            log_reg.solver = solver
            log_reg.C = c

            lap = time.time() - start_time
            print("Start time: %d m %.2f s" % (int(lap / 60), lap -
                                               (int(lap / 60) * 60)))
            print("Training logistic regression model on %s with solver: %s" %
                  (dataset, solver))
            with warnings.catch_warnings(record=True) as w:
                # Cause all warnings to always be triggered.
                warnings.simplefilter("always")

                cross_val_scores = np.zeros(num_folds, dtype=float)
                # K fold cross-validation, saving the score for each version of the model in scores
                for k, (train, val) in enumerate(folds):
                    log_reg.fit(np.take(data[dataset], train, axis=0),
                                np.take(y_train, train))
Example #21
    a = roc_auc_score(labels2[te], lr.predict_proba(tr2[te])[:, 1])  # auc_score was renamed roc_auc_score
    print('added %s auc: %3f' % (collab[i], a))
    aucholder[i] = a
    
# successively add best features (first 20)
tr2=trdata[:,(f527col,f528col)]
sortedfeatinds = aucholder.argsort()[::-1] # best first
Cs=np.logspace(0,10,5)
bestc=np.zeros(20)
besta=np.zeros(20)
for i in range(20):
    toadd=sortedfeatinds[i]
    tr2=np.append(tr2,trdata[:,toadd].reshape(nrows,1),axis=1)
    print('adding feature: %s number: %d'%(collab[toadd],i))
    for c in Cs:
        lr.C=c
        lr.fit(tr2[tr],labels2[tr])
        a = roc_auc_score(labels2[te], lr.predict_proba(tr2[te])[:, 1])
        print('AUC: %3f with C: %f'%(a,c))
        if a>besta[i]:
            bestc[i]=c
            besta[i]=a

# best score of all:
bestoveralla=besta.argsort()[-1]
bestcforbesta=bestc[bestoveralla]
print('best auc of %3f with %d features added and C=%f'%(besta.max(),(bestoveralla+1),bestcforbesta))

bestfeats = [f527col,f528col]+sortedfeatinds[:(bestoveralla+1)].tolist()

trfinal=trdata[:,tuple(bestfeats)]
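Example #22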
def makePrediction(training_date, days):
    #training_date = datetime.datetime(2014,1,22)
    title = 'scores/accuracy_' + str(days) + '.txt'
    text_file = open(title, "w")  # the function writes scores to this file below

    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 5, 26),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]

    # The test data is split into two parts: before and after 17th Mar 2015.
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the five models
    print("Hit Rates:")
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)),
              ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print(score)
    text_file.write('Neural Network : ' + str(score) + '\n')

    # 100 Days
    text_file.write('100 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 8, 6),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]

    # The test data is split into two parts: before and after 17th Mar 2015.
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the five models
    print("Hit Rates:")
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)),
              ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print(score)
    text_file.write('Neural Network : ' + str(score) + '\n')

    # 200 Days
    text_file.write('200 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 12, 31),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]

    # The test data is split into two parts: before and after 17th Mar 2015.
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the five models
    print("Hit Rates:")
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)),
              ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print(score)
    text_file.write('Neural Network : ' + str(score))
    text_file.close()
Example #23
    print("Hit Rates:")
    models = [("Linear", linear_model.LinearRegression()),
              ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=2)),
              ("SVM", SVC(C=1)),
              ("RF", RandomForestClassifier(n_estimators=1))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 20
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print(score)
    text_file.write('Neural Network : ' + str(score) + '\n')

    # 100 Days
    text_file.write('100 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ",
                                  training_date,
                                  datetime.datetime(2015, 8, 6),
                                  lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
Example #24
X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[
    train_index], y[test_index]

from sklearn.preprocessing import MinMaxScaler

scale = MinMaxScaler()
scale.fit(X_train)

X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

clf.C = 1
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

C_range_exp = np.arange(-15.0, 21.0)
C_range = 10**C_range_exp  # replaces a hand-picked C list that was immediately overwritten

from sklearn.model_selection import GridSearchCV

param = {'C': C_range}

gs = GridSearchCV(clf, param)
gs.fit(X_train, y_train)
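# Follow-up sketch: inspect the selected C and score the refit estimator
# (GridSearchCV refits on the full training split by default).
print(gs.best_params_)
print(gs.best_estimator_.score(X_test, y_test))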
Example #25
# -*- coding: utf-8 -*-
import scipy.io
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import matplotlib.pyplot as plt
#from sklearn.model_selection import GridSearchCV

C_range = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
scores = []
parameters = {'C':C_range}

mat = scipy.io.loadmat("./input/arcene.mat")
y_train = np.ravel(mat["y_train"])
y_test = np.ravel(mat["y_test"])
X_train = mat["X_train"]
X_test = mat["X_test"]

model = LogisticRegression(penalty='l1', solver='liblinear')  # l1 requires liblinear or saga

for C in C_range:
    model.C = C
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    print("Cls with penalty l1 and C: " + str(C) + ". Score: " + str(score))
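# Optional follow-up, assuming test accuracy is the selection criterion:
best_C = C_range[int(np.argmax(scores))]
model.C = best_C
model.fit(X_train, y_train)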
Example #26
# Fragment: assumes loadmat (scipy.io), numpy as np, accuracy_score and
# LogisticRegression are imported earlier in the script.
arcene = loadmat('arcene.mat')

X_train = arcene['X_train']
X_test = arcene['X_test']
y_train = arcene['y_train']
y_test = arcene['y_test']

y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

clf = LogisticRegression(penalty='l1', solver='liblinear')  # l1 requires liblinear or saga
C_range = 10.0**np.arange(0, 5)

scores = []
for C in C_range:
    clf.C = C

    #selector = RFECV(clf, step=50 , verbose=1)
    # selector.fit(X_train, y_train)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

clf.C = C_range[np.argmax(scores)]
clf.fit(X_train, y_train)

#selector = RFECV(clf, step=50 , verbose=1)
#selector.fit(X_train, y_train)
'''count = 0
for x in clf.coef_:
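Example #27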
print(X_new2.shape)
#%% Separate train and test data
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
for train_index, test_index in sss.split(X_new2, Y):
    X_train, X_test = X_new2[train_index], X_new2[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

#%%
## Logistic Regression with Lasso(CV)
Cs = np.logspace(-8, -2, 10)
scores = []
clf_l1_LR = LogisticRegression(penalty='l1', solver='liblinear')
for C in Cs:
    clf_l1_LR.C = C
    K = 3
    kf = KFold(n_splits=K)  # assumes: from sklearn.model_selection import KFold
    score = []
    for train, test in kf.split(X_train, Y_train):
        clf_l1_LR.fit(X_train[train], Y_train[train])
        y_pred = clf_l1_LR.predict_proba(X_train[test])
        score.append(brier_score_loss(Y_train[test], y_pred[:, 1]))
    scores.append(np.mean(score))
C_optim_l1 = Cs[scores.index(min(scores))]
clf_l1_LR.C = C_optim_l1
clf_l1_LR.fit(X_train, Y_train)

## Visualize the average scores for different shrinkage parameters
plt.plot(np.log10(Cs), scores)
Example #28
# Fragment: assumes warnings, loadmat (scipy.io), numpy as np, cross_val_score
# and LogisticRegression are imported earlier.
warnings.filterwarnings('ignore')

clf = LogisticRegression(penalty='l1', solver='liblinear')  # l1 requires liblinear or saga

C_array = 10**np.arange(0, 15, 0.5)

mat = loadmat('arcene\\arcene.mat')
#print(mat.keys())
X_test = mat['X_test']
X_train = mat['X_train']
y_test = mat['y_test'].ravel()
y_train = mat['y_train'].ravel()

opt_c = 0
score = 0
for c in C_array:
    clf.C = c
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    #print('# of selected features:', clf.coef_)
    cv_mean = np.mean(cross_val_score(clf, X_test, y_test))  # compute once rather than twice
    if cv_mean > score:
        score = cv_mean
        opt_c = c

print('Max score:', score, 'C-value:', opt_c)

clf.C = opt_c
clf.fit(X_train, y_train)
f_selected = np.count_nonzero(clf.coef_)
print('Features selected:', f_selected)
print('Accuracy score:', np.mean(cross_val_score(clf, X_test, y_test)))
Example #29
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

data = loadmat('arcene.mat')

x_train = data['X_train']
x_test = data['X_test']
y_train = data['y_train'].ravel()
y_test = data['y_test'].ravel()

clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)  # l1 requires liblinear or saga

# float('10e'+str(i)) for i in range(-10, 10) gives 1e-9 ... 1e10
C_range = np.logspace(-9, 10, 20)

for C in C_range:

    clf.C = C
    clf.fit(x_train, y_train)

    print('Number of selected features', np.count_nonzero(clf.coef_))
    predict = clf.predict(x_test)
    print('Accuracy: ', accuracy_score(y_test, predict))
    input('C value: ' + str(C))
Example #30
# Fragment: assumes loadmat (scipy.io), numpy as np, matplotlib.pyplot as plt,
# and LogisticRegression imported as LR earlier in the script.
mat = loadmat("arcene")
print(mat.keys())

#load dictionary data into arrays
p_test = mat["X_test"]
p_train = mat["X_train"]
q_test = mat["y_test"].ravel()
q_train = mat["y_train"].ravel()

C_range = 10.0**np.arange(-4, 3)
clf = LR()

accuracy_mat = []
weights = []
for C in C_range:
    clf.C = C
    clf.fit(p_train, q_train)
    pred = clf.predict(p_test)
    accuracy = 100.0 * np.mean(pred == q_test)
    accuracy_mat.append(accuracy)
    weights.append(clf.coef_)
indices = np.argsort(accuracy_mat)[::-1]
# Report the ten best C values; the original formatted whole arrays into
# scalar placeholders, which raises a TypeError
for idx in indices[:10]:
    print("Accuracy for C = %.2e is %.1f %% (||w|| = %.4f)" %
          (C_range[idx], accuracy_mat[idx], np.linalg.norm(weights[idx])))

#plot sparseness
plt.figure()
plt.title("Feature importances")
#plt.hist(importances,bins=10)
Example #31
    for c in Cvals:
        if n_cluster == 2:
            model = LogisticRegression(penalty='l2',
                                       C=c,
                                       class_weight='balanced')
        else:
            model = LogisticRegression(penalty='l2',
                                       C=c,
                                       class_weight='balanced',
                                       multi_class='ovr')
        score = cross_val(x_train, y_train, model, 5)
        #print score
        if score > bestscore:
            bestC = c
            bestscore = score

    model.C = bestC
    model.fit(x_train, y_train)
    preds = model.predict_proba(x_test)
    lb.fit(y_train)
    a = lb.transform(y_test)
    if (a.shape[1] != preds.shape[1]):
        a = np.vstack([a.T, np.ones(preds.shape[0])]).T
    if n_cluster == 2:
        auc_lr = roc_auc_score(y_test, preds[:, 1])
    else:
        auc_lr = roc_auc_score(a, preds)  #lb.transform(y_test))
    print('best C:', bestC, '-- AUC of logistic regression is', auc_lr)
    time_lr = time.time() - start
    print('time elapsed:', time_lr)
Example #32
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=1,test_size=0.2, train_size=0.8, random_state=0)

train_index, test_index = next(ss.split(X,y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

from sklearn.linear_model import LogisticRegression
# (assumes numpy as np and matplotlib.pyplot as plt are imported earlier)

clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Refit with much stronger regularization (smaller C) for comparison
clf.C = 1e-3
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))


X_test_value = clf.decision_function(X_test)

sorted_va  = np.sort(X_test_value)

plt.plot(X_test_value)
plt.plot([0,120],[0,0], linestyle='--')

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

plt.plot(sigmoid(sorted_va))
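# For a fitted binary LogisticRegression, sigmoid(decision_function) equals
# the positive-class predict_proba, so the hand-rolled sigmoid above can be
# sanity-checked:
print(np.allclose(sigmoid(X_test_value), clf.predict_proba(X_test)[:, 1]))  # expect True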
Example #33
MI = pd.DataFrame({'MI': MI}, index=selvars)
MI = MI.sort_values('MI', ascending=False)

var = [
    x for x in data.columns
    if 'RHI' in x or 'HSG' in x or 'AGE' in x or 'EDU' in x
]
X = data[var]
y = data['target']

clf = LogisticRegression()
Cs = np.logspace(-3, 3, 7)

skfold = StratifiedKFold(n_splits=5, shuffle=True)
result = pd.DataFrame(
    columns=['C', 'Fold', 'Acc', 'Recall', 'Precision', 'F1'])
for c in Cs:
    for pos, (train,
              valid) in enumerate(skfold.split(data[var], data['target'])):
        clf.C = c
        clf.fit(data.iloc[train][var], data.iloc[train]['target'])
        y_pred = clf.predict(data.iloc[valid][var])
        result.loc[len(result)] = [
            c, pos + 1,
            clf.score(data.iloc[valid][var], data.iloc[valid]['target']),
            recall_score(data.iloc[valid]['target'], y_pred),
            precision_score(data.iloc[valid]['target'], y_pred),
            f1_score(data.iloc[valid]['target'], y_pred)
        ]
print(result.groupby('C')[['Acc', 'Recall', 'Precision', 'F1']].mean())
Example #34
File: rtb.py Project: zengqh/mydml
ustrain = undersample(train)

y = ustrain.convert
X = ustrain.drop('convert', axis=1)

print("Remaining rows", len(ustrain))

# LogisticRegression
C_s = np.logspace(-10, 1, 11)
scores = list()
scores_std = list()
lr = LogisticRegression(penalty='l1', solver='liblinear')  # l1 requires liblinear or saga

for C in C_s:
    lr.C = C
    this_scores = cross_val_score(lr, X, y, cv=4, scoring="roc_auc")
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

lr_results = pd.DataFrame({'score': scores, 'C': C_s})
print(lr_results)

# RF
msl_s = [1, 2, 4, 8, 16, 32, 64, 128, 256]
scores = list()
rf = RandomForestClassifier(n_estimators=15)

for msl in msl_s:
    rf.min_samples_leaf = msl
                    1:]  #just want the probability that it is class 1 (active)

    #write to a file the probabilities:
    with open('LR1', 'w') as f:
        f.write("GeneId,Prediction\n")
        for i in range(len(active)):
            f.write("%.0f,%f\n" % (geneIDs[i], active[i]))

    #Experiment with parameter C and penalties l1 and l2 on un-split data:
    from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20
    C_range = 10.0**np.arange(-5, 0)  #set 4 - slide 55

    scores = []
    for C in C_range:
        for penalty in ["l1", "l2"]:
            clf.C = C
            clf.penalty = penalty
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_test)
            score = cross_val_score(clf,
                                    x_train,
                                    y_train,
                                    cv=3,
                                    scoring='roc_auc')
            scores.append((C, penalty, score.mean()))

    print([(C, p, round(s, 7)) for C, p, s in scores])  #highest auc achieved with 1e-2 and l1
    #[(1.0000000000000001e-05, 'l1', 0.5),
    #(1.0000000000000001e-05, 'l2', 0.9061314880194854),
    #(0.0001, 'l1', 0.85908152750914157),
    #(0.0001, 'l2', 0.9069741809466777),
	x_train_final=pd.DataFrame(x_train_final)
	x_train_final=pd.concat([pd.DataFrame(y_train),x_train_final],axis=1)
	x_test_final=pd.DataFrame(x_test_final)

LASSO_run = True
if LASSO_run:
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

    Cs = np.logspace(-1.5, 1.5, 10)
    lr_lasso = LogisticRegression(penalty='l1', solver='liblinear')  # l1 requires liblinear or saga
    cv_lasso_scores = list()

    # Fit lasso for various choices of regularization parameter C to select the best C
    for c in Cs:
        lr_lasso.C = c
        cv_lasso_score = cross_val_score(lr_lasso, x_train_final, y_train, scoring='roc_auc', cv=5)
        cv_lasso_scores.append(np.mean(cv_lasso_score))

    print('Best lambda based on Lasso Cross-Validation...')
    max_score = np.max(cv_lasso_scores)
    max_lambda_l1 = Cs[cv_lasso_scores.index(max_score)]
    print(1.0 / max_lambda_l1, max_score)

    lr_lasso.C = max_lambda_l1
    lr_lasso.fit(x_train_final, y_train)
    print(lr_lasso.coef_)
    tmp_names = np.array(feature_names)
    selected_features = tmp_names[lr_lasso.coef_[0] != 0]
    print('writing final selected features to file...')
    selected_features = pd.DataFrame(selected_features)