def ridge_dummy_regression(X, y, x_test, lambda_val=None):
    """Train an L2-regularized logistic regression on (X, y), predict on x_test.

    If lambda_val is provided it is used directly as the inverse-regularization
    parameter C; otherwise the best C is chosen by 5-fold cross-validated
    ROC AUC over np.logspace(-1.5, 1.5, 5).

    Returns the predicted positive-class probabilities as a 1-D array.
    """
    from sklearn.linear_model import LogisticRegression
    # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.model_selection import cross_val_score

    Cs = np.logspace(-1.5, 1.5, 5)
    lr = LogisticRegression(penalty='l2')
    if not lambda_val:
        # Score each candidate C with 5-fold CV and keep the best.
        cv_means = []
        for c in Cs:
            lr.C = c
            cv_means.append(np.mean(cross_val_score(lr, X, y, scoring='roc_auc', cv=5)))
        print('Best lambda based on Ridge Cross-Validation...')
        # argmax picks the first maximum, matching the old list.index(np.max(...))
        best_idx = int(np.argmax(cv_means))
        lambda_val = Cs[best_idx]
        print(1.0 / lambda_val, cv_means[best_idx])
    # Refit on the full training data with the selected C.
    lr.C = lambda_val
    lr.fit(X, y)
    return lr.predict_proba(x_test)[:, 1]
def logistic_regression(x_train, y_train, x_test, penalty='L2', regularization=1.0, do_CV=False):
    """Standardize features, then either CV-tune or fit an L1/L2 logistic regression.

    If do_CV is True, sweeps C over np.logspace(-1.5, 1.5, 10) and reports the
    best lambda (no file is written). Otherwise fits with C = 1/regularization,
    predicts probabilities on x_test and writes a Kaggle-style submission CSV
    (Lasso_submission.csv or Ridge_submission.csv depending on `penalty`).
    """
    from sklearn.linear_model import LogisticRegression
    ### Mean Normalize variables before regression ###
    from sklearn.preprocessing import StandardScaler

    # BUG FIX: the scaler must be fit on the training data only and then
    # applied to the test data. The old code called fit_transform on x_test
    # too, which leaked test statistics and applied an inconsistent transform.
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)

    if penalty == 'L1':
        lr = LogisticRegression(penalty='l1')
        filename = "Lasso_submission.csv"
    else:
        lr = LogisticRegression(penalty='l2')
        filename = "Ridge_submission.csv"

    if do_CV:
        # Sweep C and report the best lambda (= 1/C) by mean CV score.
        Cs = np.logspace(-1.5, 1.5, 10)
        cv_list = []
        for c in Cs:
            lr.C = c
            print('Running K-fold CV with C = %.5f' % (1.0 / c))
            # udf.cross_val_score_proba is a project-local helper
            cv_scores = udf.cross_val_score_proba(x_train, y_train, 5, lr)
            cv_list.append(np.mean(cv_scores))
        print('Best lambda based on Cross-Validation...')
        max_score = np.max(cv_list)
        max_lambda = Cs[cv_list.index(max_score)]
        print(1.0 / max_lambda, max_score)
    else:
        print('Making prediction with optimal lambda....')
        lr.C = 1.0 / regularization
        lr.fit(x_train, y_train)
        y_pred = lr.predict_proba(x_test)[:, 1]
        print('Coefficients of the regression:')
        print(lr.coef_)
        print('Writing submission file....')
        # 'w' + newline='' is the correct csv mode on Python 3 ('wb' breaks it);
        # the with-block closes the file, so no explicit close() is needed.
        with open(filename, 'w', newline='') as testfile:
            w = csv.writer(testfile)
            w.writerow(('Id', 'Probability'))
            for i, p in enumerate(y_pred):
                w.writerow((i + 1, p))
        print('File written to disk...')
def logisticcv1():
    """Sweep the L1 penalty C for logistic regression on iris with 10-fold CV,
    report the best (C, score), then fit at C=0.5 and print the confusion
    matrix and coefficients on the held-out test split."""
    # cross_validation was removed in scikit-learn 0.20
    from sklearn.model_selection import cross_val_score, train_test_split
    # BUG FIX: LogisticRegression and confusion_matrix were used but never
    # imported in the original.
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix
    from sklearn import datasets
    import numpy as np
    from collections import Counter

    # BUG FIX: was `sk.datasets.load_iris()`, but no `sk` alias was ever
    # imported (NameError).
    iris = datasets.load_iris()
    print(iris.data.shape)
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    print(X_train.shape, y_train.shape)

    # Candidate inverse-regularization values.
    alphas = np.arange(0.4, 0.55, 0.001)
    logistic2 = LogisticRegression(penalty='l1', solver='liblinear')
    maxscore = 0
    maxa = 0
    for a in alphas:
        logistic2.C = a
        mean1 = cross_val_score(logistic2, X_train, y_train, cv=10).mean()
        print("alpha and score", a, mean1)
        if mean1 > maxscore:
            maxscore = mean1
            maxa = a
    print(maxa, maxscore)

    # Final fit uses a fixed C=0.5 (not maxa), as in the original experiment.
    logistic2.C = 0.5
    c1 = Counter(y_test)
    print(c1)
    logistic2.fit(X_train, y_train)
    pred_values = logistic2.predict(X_test)
    cmatrix = confusion_matrix(y_test, pred_values)
    print("below is confusion matrix on train data set", cmatrix, sep="\n")
    print("coefficieinets ", logistic2.coef_)
    print("penalty function", logistic2.penalty, sep=" ")
def optimize_Logreg_model(data, selected_features, label_name, C_array, train_years=[2000, 2012], val_years=[2012, 2016]):
    # Tune the L1 regularization C of a class-weighted logistic regression
    # using a single predefined train/validation split by year (passed to
    # GridSearchCV as an explicit (train_ind, val_ind) pair).
    # Returns the model configured with the best C — NOTE: it is NOT refit
    # here; callers must call opt_model.fit themselves — plus the per-C
    # validation scores.
    # NOTE(review): the list defaults for train_years/val_years are mutable
    # default arguments (harmless here since never mutated, but fragile).
    reduced_data = reduce_data(data, label_name)  # project-local helper
    X = reduced_data[selected_features]
    y = reduced_data[label_name]
    # Row indices for the year-based train and validation windows
    train_ind = np.array(
        reduced_data[(reduced_data.year >= train_years[0]) & ((reduced_data.year < train_years[1]))].index)
    val_ind = np.array(
        reduced_data[(reduced_data.year >= val_years[0]) & ((reduced_data.year < val_years[1]))].index)
    y_train = reduced_data[(reduced_data.year >= train_years[0]) & (
        (reduced_data.year < train_years[1]))][label_name]
    # Inverse-frequency class weights computed on the training window only.
    # NOTE(review): unlike fit_Logreg_model elsewhere in this file, these
    # weights are not normalized to sum to 1 — confirm that is intentional.
    n_pos = len(y_train[y_train == True])
    n_neg = len(y_train[y_train == False])
    W_neg = (1.0 / n_neg)
    W_pos = (1.0 / n_pos)
    Weights = {True: W_pos, False: W_neg}
    opt_model = LogisticRegression(C=1.0, class_weight=Weights, penalty='l1',
                                   fit_intercept=True, solver='liblinear', random_state=0)
    param_grid = {'C': C_array}
    scoring = sklm.make_scorer(positive_fscore)  # project-local scorer
    # A single explicit (train, validation) split instead of K folds
    cv = ((train_ind, val_ind), )
    GS = ms.GridSearchCV(estimator=opt_model, param_grid=param_grid, cv=cv,
                         scoring=scoring, return_train_score=False, n_jobs=4)
    GS.fit(X, y)
    best_param = GS.best_params_['C']
    opt_model.C = best_param
    test_scores = GS.cv_results_['mean_test_score']
    return opt_model, test_scores
def logistic_(li_X, li_y):
    """Evaluate a logistic regression with 5-fold stratified CV.

    Args:
        li_X: feature DataFrame (``.values`` is taken before CV).
        li_y: target Series.

    Returns:
        (mean_accuracy, mean_recall, mean_precision, mean_f1,
         matplotlib line artists for the ROC curve, ROC AUC).

    NOTE(review): the ROC curve/AUC at the end are computed with the model
    from the LAST CV fold predicting on the FULL dataset (train + valid),
    which is an optimistic estimate — confirm this is intended.
    """
    X = li_X
    y = li_y
    # Logistic Regression
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    # (Removed several discarded expression statements from the original —
    # clf.coef_, clf.score(...), one-off metric calls — whose results were
    # never used.)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
    from sklearn.model_selection import StratifiedKFold
    k = 5
    skfold = StratifiedKFold(n_splits=k)
    accs = []
    recall = []
    precision = []
    f1s = []
    X = X.values
    y = y.values
    clf.C = 1  # loop-invariant; hoisted out of the fold loop
    for train_set, valid_set in skfold.split(X, y):
        clf.fit(X[train_set], y[train_set])
        y_pred = clf.predict(X[valid_set])
        accs.append(accuracy_score(y[valid_set], y_pred))
        recall.append(recall_score(y[valid_set], y_pred))
        precision.append(precision_score(y[valid_set], y_pred))
        f1s.append(f1_score(y[valid_set], y_pred))
    a = sum(accs) / len(accs)
    b = sum(recall) / len(recall)
    c = sum(precision) / len(precision)
    d = sum(f1s) / len(f1s)

    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import roc_curve
    y_prob = clf.predict_proba(X)
    # BUG FIX: roc_curve was called twice with identical arguments; the first
    # call's result was discarded.
    fpr, tpr, thres = roc_curve(y, y_prob[:, 1], pos_label=1)
    return a, b, c, d, plt.plot(fpr, tpr), roc_auc_score(y, y_prob[:, 1])
def penalty_l2(X, y, l):
    """Fit an L2-penalized logistic regression on (X, y) with C = 1/l.

    l is the regularization strength (lambda); scikit-learn's C is its
    inverse. Returns the fitted classifier.
    """
    # NOTE(review): `params` is not defined in this function — presumably a
    # module-level dict of shared LogisticRegression keyword arguments;
    # confirm it exists at call time or this raises NameError.
    clf = LogisticRegression().set_params(**params)
    clf.C = 1 / l
    clf.penalty = 'l2'
    #scores = cross_val_score(clf, X, y, cv=5)
    #print("cross validation scores of logistic regression model with l2 = {} are :".format(l), scores)
    #print("mean of cross validation scores of logistic regression with l2 = {} model is:".format(l), numpy.mean(scores))
    clf.fit(X, y)
    return clf
def fit_logistic(X, y, min_c=1e-10, max_c=1e4, num_c=50, num_folds=5, X_holdout=None, model='gboost', min_prob=5e-3):
    """Fit a classifier ('lasso' = CV-tuned L1 logistic regression, 'gboost' =
    gradient boosting) to (X, y) and return
    (probabilities, sampling function, fitted model).

    Probabilities are for X itself, or for X_holdout when given. Degenerate
    labels (all 0 or all 1) short-circuit to a constant Dummy model clipped
    away from 0/1 by min_prob.

    NOTE(review): the lambdas returned by the degenerate-label branches yield
    constant probability arrays, while the final return's lambda draws 0/1
    samples — the two shapes of "sampler" are inconsistent; confirm callers
    handle both.
    """
    # Minimal stand-in classifier that always predicts a fixed probability.
    class Dummy:
        def __init__(self, const):
            self.const = const
        def predict_proba(self, X):
            p = np.ones(len(X)) * self.const
            return np.array([1-p, p]).T
    # All-negative labels: constant low probability everywhere.
    if y.sum() == 0:
        if X_holdout is None:
            return np.zeros(len(y)) + min_prob, lambda: np.zeros(len(y), dtype=int) + min_prob, Dummy(min_prob)
        else:
            return np.zeros(X_holdout.shape[0]) + min_prob, lambda: np.zeros(X_holdout.shape[0], dtype=int) + min_prob, Dummy(min_prob)
    # All-positive labels: constant high probability everywhere.
    if y.sum() == len(y):
        if X_holdout is None:
            return np.ones(len(y)) - min_prob, lambda: np.ones(len(y), dtype=int) - min_prob, Dummy(1-min_prob)
        else:
            return np.ones(X_holdout.shape[0]) - min_prob, lambda: np.ones(X_holdout.shape[0], dtype=int) - min_prob, Dummy(1-min_prob)
    if model == 'lasso':
        from sklearn.linear_model import LogisticRegression
        # Use cross-validation to select lambda
        c_vals = np.exp(np.linspace(np.log(min_c), np.log(max_c), num_c))
        cv_scores = np.zeros(num_c)
        folds = create_folds(X, num_folds)  # project-local fold generator
        for i,fold in enumerate(folds):
            # print '\tFold #{0}'.format(i)
            mask = np.ones(len(X), dtype=bool)
            mask[fold] = False
            X_train, y_train = X[mask], y[mask]
            X_test, y_test = X[~mask], y[~mask]
            # Degenerate training fold: score by the count of the trivially
            # correct predictions instead of fitting.
            if y_train.sum() == 0:
                cv_scores += (1-y_test).sum()
            elif y_train.sum() == len(y_train):
                cv_scores += (y_test).sum()
            else:
                # warm_start reuses the previous solution as C grows along the path
                lr = LogisticRegression(penalty='l1', C=min_c, warm_start=True)
                for j,c in enumerate(c_vals):
                    lr.C = c
                    lr.fit(X_train, y_train)
                    # Log-likelihood of the true labels: column y_test[i]
                    # selects each sample's own class log-probability.
                    cv_scores[j] += lr.predict_log_proba(X_test)[:,y_test].sum()
        cv_scores /= float(len(X))
        best_idx = np.argmax(cv_scores)
        best_c = c_vals[best_idx]
        # NOTE(review): the final model defaults to penalty='l2' even though
        # C was tuned on an l1 path — confirm intentional.
        lr = LogisticRegression(C=best_c)
    elif model == 'gboost':
        from sklearn.ensemble import GradientBoostingClassifier
        lr = GradientBoostingClassifier(subsample=0.5)
    lr.fit(X, y)
    if X_holdout is None:
        probs = lr.predict_proba(X)[:,1]
    else:
        probs = lr.predict_proba(X_holdout)[:,1]
    # Third element: a Bernoulli sampler over the predicted probabilities.
    return probs, lambda: (np.random.random(size=len(probs)) <= probs).astype(int), lr
def ridge_dummy_regression(X, y, testData, lambda_val=None):
    """Train an L2 logistic regression on (X, y) and predict on testData.

    If lambda_val is given it is used directly as the inverse-regularization
    parameter C. Otherwise C is chosen from np.logspace(-1.5, 1.5, 5) by ROC
    AUC on a single fixed 80/20 holdout split (the response is very sparse).
    Note: the old docstring claimed 5-fold CV, which the code never did.

    Returns the predicted positive-class probabilities.
    """
    from sklearn.linear_model import LogisticRegression
    # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    Cs = np.logspace(-1.5, 1.5, 5)
    lr = LogisticRegression(penalty='l2')
    if not lambda_val:
        # The split is deterministic (fixed random_state), so hoist it out of
        # the loop instead of re-splitting identically for every candidate C.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42)
        aucs = []
        for c in Cs:
            lr.C = c
            lr.fit(X_train, y_train)
            aucs.append(roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))
        print('Best lambda based on Ridge Cross-Validation...')
        best = int(np.argmax(aucs))  # first maximum, as with list.index(max)
        lambda_val = Cs[best]
        print(1.0 / lambda_val, aucs[best])
    # Refit on the full training data with the selected C.
    lr.C = lambda_val
    lr.fit(X, y)
    return lr.predict_proba(testData)[:, 1]
def fit_Logreg_model(data, selected_features, label_name, C_array, n_splits=2, shuffle=True, shuffle_seed=10):
    # Grid-search the L1 regularization C of a class-weighted logistic
    # regression with shuffled K-fold CV. Returns (X, y, model, mean CV
    # scores, CV score stds). NOTE: the returned model carries the best C but
    # is NOT refit here — callers must call opt_model.fit themselves.
    reduced_data = reduce_data(data, label_name)  # project-local helper
    X = reduced_data[selected_features]
    y = reduced_data[label_name]
    # Inverse-frequency class weights, normalized so W_pos + W_neg == 1.
    n_pos = len(y[y == True])
    n_neg = len(y[y == False])
    W_neg = (1.0 / n_neg) / (1.0 / n_pos + 1.0 / n_neg)
    W_pos = (1.0 / n_pos) / (1.0 / n_pos + 1.0 / n_neg)
    Weights = {True: W_pos, False: W_neg}
    opt_model = LogisticRegression(C=1.0, class_weight=Weights, penalty='l1',
                                   fit_intercept=True, solver='liblinear', random_state=0)
    param_grid = {'C': C_array}
    scoring = sklm.make_scorer(weighted_fscore)  # project-local scorer
    cv = ms.KFold(n_splits=n_splits, shuffle=shuffle, random_state=shuffle_seed)
    # NOTE(review): the `iid` parameter was removed in scikit-learn 0.24, so
    # this code pins an older sklearn version.
    GS = ms.GridSearchCV(estimator=opt_model, param_grid=param_grid, cv=cv,
                         scoring=scoring, return_train_score=False, n_jobs=4, iid=True)
    GS.fit(X, y)
    best_param = GS.best_params_['C']
    opt_model.C = best_param
    mean_test_scores = GS.cv_results_['mean_test_score']
    std_test_scores = GS.cv_results_['std_test_score']
    return X, y, opt_model, mean_test_scores, std_test_scores
def ridge_dummy_regression(X,y,testData,lambda_val=None): """ Train ridge L2 Logistic Regression on X,y. Then predict on x_test If lambda_val is provided, will just use this parameter for the L2 LR otherwise, will run 5-fold CV on C = log(-1.5, 1.5,5) This function returns a list of predicted probabilities as a list """ from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import cross_val_score from sklearn.metrics import roc_auc_score Cs=np.logspace(-1.5, 1.5, 5) lr = LogisticRegression(penalty='l2') cv_list=list() if not lambda_val: # Fit ridge to various choices of regularization parameter C to select best C for c in Cs: lr.C = c ### randomly divide data into 80/20 split ### ### because response is very sparse ### from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=42) lr.fit(X_train,y_train) y_pred=lr.predict_proba(X_test)[:,1] cv_list.append(roc_auc_score(y_test,y_pred)) print 'Best lambda based on Ridge Cross-Validation...' max_score=np.max(cv_list) lambda_val=Cs[cv_list.index(max_score)] print 1.0/lambda_val, max_score # Train LR with the optimized regularization parameter ### lr.C = lambda_val lr.fit(X,y) proba_lst = lr.predict_proba(testData)[:,1] return proba_lst
def train_logistic(X, y):
    """Grid-search the inverse-regularization C for a logistic regression by
    5-fold CV on ROC AUC, then refit on all of (X, y) with the best C.

    Returns the fitted LogisticRegression.
    """
    # sklearn.grid_search was removed in scikit-learn 0.20; the unused SVC
    # import from the original has also been dropped.
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression()
    Crange = np.logspace(-8, 1, 8)
    grid = GridSearchCV(clf, param_grid={'C': Crange}, scoring='roc_auc', cv=5)
    grid.fit(X, y)
    # Refit a fresh model on the full data with the winning C.
    clf.C = grid.best_params_['C']
    clf.fit(X, y)
    return clf
def update(self):
    """Train a LogisticRegression from stored observation/truth vectors and
    persist the pickled model; every hyperparameter is pulled from the
    stored parameter metadata."""
    # Handle to storage for model parameters
    params = self._parameters
    print "Starting to update Logistic Regression!"
    # Make sure all my meta data is ready to go
    params.validateMeta()
    observation_vectors = []
    truth_vectors = []
    # Make sure my model data is ready to go
    self._model_data.validate()
    self._model_data.validateViews(self.getMetaData("db_views"))
    # Check my model data
    observation_vectors = self._model_data.getMetaData(
        "observation_vectors")
    truth_vectors = self._model_data.getMetaData("truth_vectors")
    # Clear the processed views from the parameter store
    params.setMetaData("db_views", [])
    # Houston we are go
    lr = LogisticRegression()
    # Copy each stored hyperparameter onto the estimator attribute-by-attribute.
    lr.penalty = params.getMetaData("penalty")
    lr.dual = params.getMetaData("dual")
    lr.C = params.getMetaData("C")
    lr.fit_intercept = params.getMetaData("fit_intercept")
    lr.intercept_scaling = params.getMetaData("intercept_scaling")
    # class_weight and tol are optional: only override sklearn's defaults
    # when a stored value is present.
    class_weight = params.getMetaData("class_weight")
    if (class_weight != None):
        lr.class_weight = class_weight
    lr.max_iter = params.getMetaData("max_iter")
    lr.random_state = params.getMetaData("random_state")
    lr.solver = params.getMetaData("solver")
    tol = params.getMetaData("tol")
    if (tol != None):
        lr.tol = tol
    lr.multi_class = params.getMetaData("multi_class")
    lr.verbose = params.getMetaData("verbose")
    # Evaluation mode loads several model artifacts from storage and sets them as inputs
    lr.fit(observation_vectors, truth_vectors)
    # Persist the fitted model as a pickle blob in the parameter store.
    params.setBinaryData("lr_model", "application/pickle", pickle.dumps(lr))
    self.finalize()
def build_model_rbm():
    """Assemble the (name, estimator) steps for an RBM -> logistic-regression
    pipeline and return them as a list."""
    # Seed the global RNG so the RBM's random_state draw below is reproducible.
    np.random.seed(12)
    boltzmann = BernoulliRBM(random_state=np.random.randint(1, 100), verbose=0)
    boltzmann.learning_rate = 0.0001
    classifier = LogisticRegression()
    classifier.C = 10.0
    return [('rbm', boltzmann), ('lr', classifier)]
def HyperSearch():
    # Courtesy of Miroslaw Horbal
    # Select the regularization C for LogisticRegression by holdout AUC over
    # 32 candidates in [1, 4], printing each score and the winner.
    # NOTE(review): relies on the project-local `fileio` and `classifier`
    # modules; Python 2 print statements throughout.
    base = [127, 96, 53, 3, 103, 71, 151, 1, 65, 152]  # pre-selected feature ids
    f = fileio.Preprocessed('../data/quads10Threshold.csv')
    f.encode(base)
    train, truth = f.transformTrain(base)
    print "Performing hyperparameter selection..."
    clf = LogisticRegression(C=2.3,class_weight='auto')
    # Hyperparameter selection loop
    score_hist = []
    Cvals = np.linspace(1,4,32)
    eval_ = classifier.Classifier(train, truth)
    for C in Cvals:
        clf.C = C
        # 10-fold holdout evaluation with a 20% holdout fraction
        score = eval_.holdout(clf,nFolds=10,fraction=0.2)
        score_hist.append((score,C))
        print "C: %f Mean AUC: %f" %(C, score)
    # Highest score wins; score ties resolve toward the larger C via tuple order.
    bestC = sorted(score_hist)[-1][1]
    print "Best C value: %f" % (bestC)
def HyperSearch(): # Courtesy of Miroslaw Horbal base = [127, 96, 53, 3, 103, 71, 151, 1, 65, 152] f = fileio.Preprocessed('../data/quads10Threshold.csv') f.encode(base) train, truth = f.transformTrain(base) print "Performing hyperparameter selection..." clf = LogisticRegression(C=2.3, class_weight='auto') # Hyperparameter selection loop score_hist = [] Cvals = np.linspace(1, 4, 32) eval_ = classifier.Classifier(train, truth) for C in Cvals: clf.C = C score = eval_.holdout(clf, nFolds=10, fraction=0.2) score_hist.append((score, C)) print "C: %f Mean AUC: %f" % (C, score) bestC = sorted(score_hist)[-1][1] print "Best C value: %f" % (bestC)
LRs = [] for train, test in cv.split(X, y): # clf = LogisticRegression(C=1) clf = LogisticRegressionCV() clf.fit(X[train], y[train]) y_pred = clf.predict(X[test]) scores.append(roc_auc_score(y[test], y_pred)) coefs.append(clf.coef_) Cs.append(clf.C_) LRs.append(clf) lr_mean = LogisticRegression() lr_mean.coef_ = np.asarray(coefs).mean(axis=0) lr_mean.C = np.asarray(Cs).mean() lr_mean.intercept_ = np.asarray([est.intercept_ for est in LRs]).mean() lr_coef_mean = np.asarray(coefs).mean(axis=0) lr_coef_std = np.asarray(coefs).std(axis=0) cv_scores = cross_val_score(lr_mean, X, y, scoring="roc_auc", cv=StratifiedKFold(9)) score_full_X, perm_scores_full_X, pvalue_full_X = permutation_test_score( lr_mean, X, y,
# --- Candidate classifiers (constructed here; fitting happens elsewhere) ---
lcr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=10)
rrc = RidgeClassifierCV(normalize=True)
ada = AdaBoostClassifier()
# NOTE(review): an AdaBoostRegressor wrapping a DecisionTreeClassifier base
# estimator is an unusual pairing — confirm it is intentional.
ada_dct = AdaBoostRegressor(DecisionTreeClassifier(max_depth=2),n_estimators=600, random_state=np.random.RandomState(1))
lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
qda = QuadraticDiscriminantAnalysis()
rfc = RandomForestClassifier() # Random Forests are gooooood!!
gb = GradientBoostingClassifier(n_estimators=1000)
dtr = DecisionTreeClassifier()
rbm = BernoulliRBM(n_components=2)
logistic = LogisticRegression()
rbm.learning_rate = 0.06
rbm.n_iter = 20
# Overrides the n_components=2 passed to the BernoulliRBM constructor above.
rbm.n_components = 100
logistic.C = 1
# RBM feature extraction feeding a logistic-regression classifier
rbm_lcr = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
# --------------------------------------------------------------------- #
# ----------------------------- HMM ----------------------------------- #
# --------------------------------------------------------------------- #
#n_mix
'''
markov = hmm.GaussianHMM(n_components=3, n_iter=500, init_params="mcst", covariance_type="full")
'''
markov = hmm.GaussianHMM(n_components=3, n_iter=500, params="mcs", init_params="mcs", covariance_type="full")
# Hand-tuned 3-state transition matrix (rows sum to 1), set directly rather
# than learned ("t" is excluded from params/init_params above).
markov.transmat_ = np.array([[ 0.95354708, 0.04633496, 0.00011796], [ 0.04959727, 0.93909542, 0.01130731], [ 0.05827543, 0.00015793, 0.94156665]])
# We should try to learn parameters of MARKOV TRANSITION MATRIX !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# --------------------------------------------------------------------- #
# ----------------------------- GMM ----------------------------------- #
def fit(self, X, y, Xstd=None):
    """Fit model to training data.

    Column-generation fit for a logistic rule regression: starting from an
    empty (or singleton-conjunction) rule set, repeatedly beam-search for
    conjunctions whose subdifferential excludes zero, add them, and refit an
    L1 logistic regression until no such conjunction remains or iterMax is
    reached.

    Args:
        X (DataFrame): Binarized features with MultiIndex column labels
        y (array): Target variable
        Xstd (DataFrame, optional): Standardized numerical features

    Returns:
        LogisticRuleRegression: Self
    """
    # Initialization
    # Number of samples
    n = X.shape[0]
    if self.init0:
        # Initialize with empty feature indicator and conjunction matrices
        z = pd.DataFrame([], index=X.columns)
        A = np.empty((X.shape[0], 0))
    else:
        # Initialize with X itself i.e. singleton conjunctions
        # Feature indicator and conjunction matrices
        z = pd.DataFrame(np.eye(X.shape[1], dtype=int), index=X.columns)
        # Remove negations
        indPos = X.columns.get_level_values(1).isin(['', '<=', '=='])
        z = z.loc[:,indPos]
        A = X.loc[:,indPos].values
        # Scale conjunction matrix to account for non-uniform penalties
        A = A * self.lambda0 / (self.lambda0 + self.lambda1 * z.sum().values)
    if self.useOrd:
        self.namesOrd = Xstd.columns
        numOrd = Xstd.shape[1]
        # Scale ordinal features to have similar std as "average" binary feature
        Astd = 0.4 * Xstd.values
    # Iteration counter
    self.it = 0
    # Logistic regression object
    lr = LogisticRegression(penalty='l1', C=1/(n*self.lambda0), solver='saga',
                            multi_class='ovr', max_iter=self.maxSolverIter)
    self.p = y.mean()
    if self.init0:
        # Initial residual
        r = (self.p - y) / n
        # Derivative w.r.t. intercept term
        UB = min(r.sum(), 0)
    else:
        # Fit logistic regression model
        if self.useOrd:
            # Ordinal features are prepended to the rule columns
            B = np.concatenate((Astd, A), axis=1)
            lr.fit(B, y)
            # Initial residual
            r = (lr.predict_proba(B)[:,1] - y) / n
        else:
            lr.fit(A, y)
            # Initial residual
            r = (lr.predict_proba(A)[:,1] - y) / n
        # Most "negative" subderivative among current variables (undo scaling)
        UB = -np.abs(np.dot(r, A))
        UB *= (self.lambda0 + self.lambda1 * z.sum().values) / self.lambda0
        UB += self.lambda0 + self.lambda1 * z.sum().values
        UB = min(UB.min(), 0)
    # Beam search for conjunctions with subdifferentials that exclude zero
    # (positive and negative residual directions searched separately)
    vp, zp, Ap = beam_search_K1(r, X, self.lambda0, self.lambda1,
                                UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
    vn, zn, An = beam_search_K1(-r, X, self.lambda0, self.lambda1,
                                UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
    v = np.append(vp, vn)
    while (v < UB).any() and (self.it < self.iterMax):
        # Subdifferentials excluding zero exist, continue
        self.it += 1
        zNew = pd.concat([zp, zn], axis=1, ignore_index=True)
        Anew = np.concatenate((Ap, An), axis=1)
        # K conjunctions with largest subderivatives in absolute value
        idxLargest = np.argsort(v)[:self.K]
        v = v[idxLargest]
        zNew = zNew.iloc[:,idxLargest]
        Anew = Anew[:,idxLargest]
        # Scale new conjunction matrix to account for non-uniform penalties
        Anew = Anew * self.lambda0 / (self.lambda0 + self.lambda1 * zNew.sum().values)
        # Add to existing conjunctions
        z = pd.concat([z, zNew], axis=1, ignore_index=True)
        A = np.concatenate((A, Anew), axis=1)
        # Fit logistic regression model
        if self.useOrd:
            B = np.concatenate((Astd, A), axis=1)
            lr.fit(B, y)
            # Residual
            r = (lr.predict_proba(B)[:,1] - y) / n
        else:
            lr.fit(A, y)
            # Residual
            r = (lr.predict_proba(A)[:,1] - y) / n
        # Most "negative" subderivative among current variables (undo scaling)
        UB = -np.abs(np.dot(r, A))
        UB *= (self.lambda0 + self.lambda1 * z.sum().values) / self.lambda0
        UB += self.lambda0 + self.lambda1 * z.sum().values
        UB = min(UB.min(), 0)
        # Beam search for conjunctions with subdifferentials that exclude zero
        vp, zp, Ap = beam_search_K1(r, X, self.lambda0, self.lambda1,
                                    UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
        vn, zn, An = beam_search_K1(-r, X, self.lambda0, self.lambda1,
                                    UB=UB, B=self.B, wLB=self.wLB, eps=self.eps, stopEarly=self.stopEarly)
        v = np.append(vp, vn)
    # Restrict model to conjunctions with nonzero coefficients
    try:
        idxNonzero = np.where(np.abs(lr.coef_) > self.eps)[1]
        if self.useOrd:
            # Nonzero indices of standardized and rule features
            self.idxNonzeroOrd = idxNonzero[idxNonzero < numOrd]
            nnzOrd = len(self.idxNonzeroOrd)
            idxNonzeroRules = idxNonzero[idxNonzero >= numOrd] - numOrd
            if self.debias and len(idxNonzero):
                # Re-fit logistic regression model with effectively no regularization
                z = z.iloc[:,idxNonzeroRules]
                lr.C = 1 / self.eps
                lr.fit(B[:,idxNonzero], y)
                idxNonzero = np.where(np.abs(lr.coef_) > self.eps)[1]
                # Nonzero indices of standardized and rule features
                idxNonzeroOrd2 = idxNonzero[idxNonzero < nnzOrd]
                self.idxNonzeroOrd = self.idxNonzeroOrd[idxNonzeroOrd2]
                idxNonzeroRules = idxNonzero[idxNonzero >= nnzOrd] - nnzOrd
            self.z = z.iloc[:,idxNonzeroRules]
            lr.coef_ = lr.coef_[:,idxNonzero]
        else:
            if self.debias and len(idxNonzero):
                # Re-fit logistic regression model with effectively no regularization
                z = z.iloc[:,idxNonzero]
                lr.C = 1 / self.eps
                lr.fit(A[:,idxNonzero], y)
                idxNonzero = np.where(np.abs(lr.coef_) > self.eps)[1]
            self.z = z.iloc[:,idxNonzero]
            lr.coef_ = lr.coef_[:,idxNonzero]
    except AttributeError:
        # Model has no coefficients except intercept
        self.z = z
    self.lr = lr
scores = np.zeros((1, len(solvers), len(C_values)), dtype=np.float) log_reg.penalty = 'l2' log_reg.max_iter = 6000 start_time = time.time() for i, dataset in enumerate(data): for j, solver in enumerate(solvers): for l, c in enumerate(C_values): # Since each dataset has the same order of data, just with different preprocessing, we can use the same k-folds for # each variant of the data (same applies for labels) folds = cv_folds.split(int_train_data, y_train) # Changing hyperparameter for this run log_reg.solver = solver log_reg.C = c lap = time.time() - start_time print("Start time: %d m %.2f s" % (int(lap / 60), lap - (int(lap / 60) * 60))) print("Training logisitic regression model on %s with solver: %s" % (dataset, solver)) with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. warnings.simplefilter("always") cross_val_scores = np.zeros(num_folds, dtype=np.float) # K fold cross-validation, saving the score for each version of the model in scores for k, (train, val) in enumerate(folds): log_reg.fit(np.take(data[dataset], train, axis=0), np.take(y_train, train))
a=auc_score(labels2[te],lr.predict_proba(tr2[te])[:,1]) print('added %s auc: %3f'%(collab[i],a)) aucholder[i]=a # successively add best features (first 20) tr2=trdata[:,(f527col,f528col)] sortedfeatinds = aucholder.argsort()[::-1] # best first Cs=np.logspace(0,10,5) bestc=np.zeros(20) besta=np.zeros(20) for i in range(20): toadd=sortedfeatinds[i] tr2=np.append(tr2,trdata[:,toadd].reshape(nrows,1),axis=1) print('adding feature: %s number: %d'%(collab[toadd],i)) for c in Cs: lr.C=c lr.fit(tr2[tr],labels2[tr]) a=auc_score(labels2[te],lr.predict_proba(tr2[te])[:,1]) print('AUC: %3f with C: %f'%(a,c)) if a>besta[i]: bestc[i]=c besta[i]=a # best score of all: bestoveralla=besta.argsort()[-1] bestcforbesta=bestc[bestoveralla] print('best auc of %3f with %d features added and C=%f'%(besta.max(),(bestoveralla+1),bestcforbesta)) bestfeats = [f527col,f528col]+sortedfeatinds[:(bestoveralla+1)].tolist() trfinal=trdata[:,tuple(bestfeats)]
def makePrediction(training_date, days):
    """Train several classifiers on lagged NDAQ return series at three horizons
    (ending 2015-05-26, 2015-08-06 and 2015-12-31), log each model's hit rate
    via fit_model, and record the RBM+logistic pipeline's training accuracy.

    NOTE(review): `text_file` is written to throughout but its open() call is
    commented out below — as written this raises NameError unless `text_file`
    is provided globally; confirm. `days` is only used in the commented-out
    title. Python 2 print statements throughout.
    """
    #training_date = datetime.datetime(2014,1,22)
    #title = 'scores/accuracy_' + str(days) + '.txt'
    #print title
    #text_file = open(title, "w")
    snpret = create_lagged_series("NDAQ", training_date, datetime.datetime(2015, 5, 26), lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]
    # The test data is everything on or after 2015-03-17
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the candidate models; fit_model logs each hit rate
    print "Hit Rates:"
    models = [("Linear", linear_model.LinearRegression()), ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)), ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)
    # RBM feature extraction feeding a logistic regression
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    # NOTE(review): score is computed on the TRAINING data, not the test split
    score = classifier.score(X_train, y_train)
    print score
    text_file.write('Neural Network : ' + str(score) + '\n')
    # 100 Days — repeat the same experiment with data through 2015-08-06
    text_file.write('100 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ", training_date, datetime.datetime(2015, 8, 6), lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]
    # Same fixed split date as above
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the candidate models
    print "Hit Rates:"
    models = [("Linear", linear_model.LinearRegression()), ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)), ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print score
    text_file.write('Neural Network : ' + str(score) + '\n')
    # 200 Days — repeat once more with data through 2015-12-31
    text_file.write('200 Days Prediction Accuracies\n')
    snpret = create_lagged_series("NDAQ", training_date, datetime.datetime(2015, 12, 31), lags=5)
    #print snpret
    # Use the prior five days of returns as predictor values, with direction as the response
    X = snpret[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5"]]
    y = snpret["Direction"]
    # Same fixed split date as above
    start_test = datetime.datetime(2015, 3, 17)
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
    # Create prediction DataFrame
    pred = pd.DataFrame(index=y_test.index)
    #print pred
    pred["Actual"] = y_test
    # Create and fit the candidate models
    print "Hit Rates:"
    models = [("Linear", linear_model.LinearRegression()), ("LR", LogisticRegression()),
              ("KNN", neighbors.KNeighborsClassifier(n_neighbors=3)), ("SVM", SVC(C=10)),
              ("RF", RandomForestClassifier(n_estimators=4))]
    for m in models:
        fit_model(m[0], m[1], X_train, y_train, X_test, pred)
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    rbm.learning_rate = .06
    rbm.n_iter = 15
    rbm.n_components = 100
    logistic.C = 6000
    classifier.fit(X_train, y_train)
    logistic_classifier = LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, y_train)
    score = classifier.score(X_train, y_train)
    print score
    text_file.write('Neural Network : ' + str(score))
    text_file.close()
print "Hit Rates:" models = [("Linear", linear_model.LinearRegression()), ("LR", LogisticRegression()), ("KNN", neighbors.KNeighborsClassifier(n_neighbors=2)), ("SVM", SVC(C=1)), ("RF", RandomForestClassifier(n_estimators=1))] for m in models: fit_model(m[0], m[1], X_train, y_train, X_test, pred) logistic = LogisticRegression() rbm = BernoulliRBM(random_state=0, verbose=True) classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]) rbm.learning_rate = .06 rbm.n_iter = 20 rbm.n_components = 100 logistic.C = 6000 classifier.fit(X_train, y_train) logistic_classifier = LogisticRegression(C=100.0) logistic_classifier.fit(X_train, y_train) score = classifier.score(X_train, y_train) print score text_file.write('Neural Network : ' + str(score) + '\n') # 100 Days text_file.write('100 Days Prediction Accuracies\n') snpret = create_lagged_series("NDAQ", training_date, datetime.datetime(2015, 8, 6), lags=5) #print snpret # Use the prior two days of returns as predictor values, with direction as the response
X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[ train_index], y[test_index] from sklearn.preprocessing import MinMaxScaler scale = MinMaxScaler() scale.fit(X_train) X_train = scale.transform(X_train) X_test = scale.transform(X_test) from sklearn.linear_model import LogisticRegression clf = LogisticRegression() clf.C = 1 clf.fit(X_train, y_train) clf.score(X_test, y_test) C_range = [1e-5, 1e-3, 1e-2, 1, 1e2, 1e5, 1e10] C_range_exp = np.arange(-15.0, 21.0) C_range = 10**C_range_exp from sklearn.model_selection import GridSearchCV param = {'C': C_range} gs = GridSearchCV(clf, param) gs.fit(X_train, y_train)
# -*- coding: utf-8 -*-
"""Sweep the inverse-regularization C of an L1 logistic regression on the
arcene dataset and print the test-set accuracy for each candidate."""
import scipy.io
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import matplotlib.pyplot as plt
#from sklearn.model_selection import GridSearchCV

C_range = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
scores = []
parameters = {'C':C_range}

mat = scipy.io.loadmat("./input/arcene.mat")
y_train = np.ravel(mat["y_train"])
y_test = np.ravel(mat["y_test"])
X_train = mat["X_train"]
X_test = mat["X_test"]

model = LogisticRegression(penalty='l1')
for C in C_range:
    model.C = C
    # (Fixed: re-assigning model.penalty='l1' each iteration was a no-op —
    # the estimator is constructed with penalty='l1' — and the bare
    # `model.coef_` expression statement's value was discarded.)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    print("Cls with penalty l1 and C: " + str(C) + "'. Score: " + str(score))
arcene = loadmat('arcene.mat') X_train = arcene['X_train'] X_test = arcene['X_test'] y_train = arcene['y_train'] y_test = arcene['y_test'] y_train = np.ravel(y_train) y_test = np.ravel(y_test) clf = LogisticRegression(penalty='l1') C_range = 10.0**np.arange(0, 5) scores = [] for C in C_range: clf.C = C #selector = RFECV(clf, step=50 , verbose=1) # selector.fit(X_train, y_train) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) score = accuracy_score(y_test, y_pred) scores.append(score) clf.C = C_range[np.argmax(scores)] clf.fit(X_train, y_train) #selector = RFECV(clf, step=50 , verbose=1) #selector.fit(X_train, y_train) '''count = 0 for x in clf.coef_:
X_new2.shape #%% Seperate train and test data from sklearn.model_selection import StratifiedShuffleSplit sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0) for train_index, test_index in sss.split(X_new2, Y): X_train, X_test = X_new2[train_index], X_new2[test_index] Y_train, Y_test = Y[train_index], Y[test_index] #%% ## Logistic Regression with Lasso(CV) Cs = np.logspace(-8, -2, 10) scores = [] clf_l1_LR = LogisticRegression(penalty='l1', solver='liblinear') for C in Cs: clf_l1_LR.C = C K = 3 kf = KFold(n_splits=K) score = [] for train, test in kf.split(X_train, Y_train): clf_l1_LR.fit(X_train[train], Y_train[train]) y_pred = clf_l1_LR.predict_proba(X_train[test]) score.append(brier_score_loss(Y_train[test], y_pred[:, 1])) scores.append(np.mean(score)) C_optim_l1 = Cs[scores.index(min(scores))] clf_l1_LR.C = C_optim_l1 clf_l1_LR.fit(X_train, Y_train) ## Visualize the average scores for different shrinkage parameters plt.plot(np.log10(Cs), scores)
# Sweep C for an L1 logistic regression on arcene and pick the value with the
# best cross-validated score.
# NOTE(review): model selection here cross-validates on the TEST set, which
# is methodologically dubious — confirm before reusing these numbers.
warnings.filterwarnings('ignore')

clf = LogisticRegression(penalty='l1')
C_array = 10**np.arange(0, 15, 0.5)

mat = loadmat('arcene\\arcene.mat')
#print(mat.keys())
X_test = mat['X_test']
X_train = mat['X_train']
y_test = mat['y_test'].ravel()
y_train = mat['y_train'].ravel()

opt_c = 0
score = 0
for c in C_array:
    clf.C = c
    clf.fit(X_train, y_train)
    # FIX: cross_val_score was evaluated twice per candidate (once in the
    # comparison, once to store) — KFold without shuffling is deterministic,
    # so a single call is equivalent and halves the work. The unused
    # clf.predict(X_test) call was also dropped.
    cv_mean = np.mean(cross_val_score(clf, X_test, y_test))
    if cv_mean > score:
        score = cv_mean
        opt_c = c
print('Max score:', score, 'C-value:', opt_c)

# Refit with the winning C and report sparsity + accuracy.
clf.C = opt_c
clf.fit(X_train, y_train)
f_selected = np.count_nonzero(clf.coef_)
print('Features selected:', f_selected)
print('Accuracy score:', np.mean(cross_val_score(clf, X_test, y_test)))
# Sweep the L1 penalty for logistic regression on the Arcene data and show
# how sparsity (non-zero coefficients) and test accuracy change with C.
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

data = loadmat('arcene.mat')
x_train = data['X_train']
x_test = data['X_test']
y_train = data['y_train'].ravel()
y_test = data['y_test'].ravel()

clf = LogisticRegression(penalty='l1', random_state=0)

# Same grid as float('10e' + str(i)) for i in -10..9, i.e. 1e-9 .. 1e10,
# built arithmetically instead of by parsing strings.  The redundant
# pre-loop clf.fit (its result was immediately overwritten) is removed.
C_range = 10.0 ** np.arange(-9, 11)

for C in C_range:
    clf.C = C
    clf.fit(x_train, y_train)
    print('Number of selected features', np.count_nonzero(clf.coef_))
    predict = clf.predict(x_test)
    print('Accuracy: ', accuracy_score(y_test, predict))
    # Pause so each C's output can be inspected before moving on.
    input('C value: ' + str(C))
# Evaluate logistic regression on the Arcene data over a grid of C values
# and report accuracy and weight norm for the best-performing settings.
mat = loadmat("arcene")
print(mat.keys())

# Load dictionary data into arrays.
p_test = mat["X_test"]
p_train = mat["X_train"]
q_test = mat["y_test"].ravel()
q_train = mat["y_train"].ravel()

C_range = 10.0**np.arange(-4, 3)
clf = LR()

accuracy_mat = []  # accuracy (%) per candidate C
weights = []       # fitted coefficient vectors per candidate C
for C in C_range:
    clf.C = C
    clf.fit(p_train, q_train)
    pred = clf.predict(p_test)
    accuracy = 100.0 * np.mean(pred == q_test)
    accuracy_mat.append(accuracy)
    weights.append(clf.coef_)

# Report the top settings, best accuracy first.  The original passed whole
# index arrays into scalar %-format fields and subscripted the scalar
# `accuracy` (instead of `accuracy_mat`), which raises at runtime; print
# one line per top index instead.
indices = np.argsort(accuracy_mat)[::-1]
for idx in indices[:10]:
    print("Accuracy for C = %.2e is %.1f %% (||w|| = %.4f)" %
          (C_range[idx], accuracy_mat[idx], np.linalg.norm(weights[idx])))

#plot sparseness
plt.figure()
plt.title("Feature importances")
#plt.hist(importances,bins=10)
# Select the L2 regularisation strength C by cross-validation, refit, and
# report the test-set ROC AUC and elapsed time (Python 2 syntax).
# Relies on Cvals, n_cluster, bestscore/bestC, cross_val, lb, x_*/y_* and
# `start` being defined earlier in the file.
for c in Cvals:
    if n_cluster == 2:
        model = LogisticRegression(penalty='l2', C=c, class_weight='balanced')
    else:
        # One-vs-rest handles the multi-class case.
        model = LogisticRegression(penalty='l2', C=c, class_weight='balanced', multi_class='ovr')
    score = cross_val(x_train, y_train, model, 5)
    #print score
    if score > bestscore:
        bestC = c
        bestscore = score
# Refit the last-constructed model object at the best C found above.
model.C = bestC
model.fit(x_train, y_train)
preds = model.predict_proba(x_test)
lb.fit(y_train)
a = lb.transform(y_test)
# Pad the binarised labels when the binarised test matrix is missing a
# class column, so its shape matches the predicted-probability matrix.
if (a.shape[1] != preds.shape[1]):
    a = np.vstack([a.T, np.ones(preds.shape[0])]).T
if n_cluster == 2:
    # Binary case: AUC on the positive-class probability column.
    auc_lr = roc_auc_score(y_test, preds[:, 1])
else:
    auc_lr = roc_auc_score(a, preds)  #lb.transform(y_test))
print 'best C:', bestC, '-- AUC of logistic regression is ', auc_lr
time_lr = time.time() - start
print 'time elapsed:', time_lr
# Split the data, fit a default and a strongly regularised logistic
# regression, then visualise the test-set decision values and their sigmoid.
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression

ss = ShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=0)
train_index, test_index = next(ss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# Much stronger regularisation (C = 1e-3); refit and re-score.
clf.C = 1e-3
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# Raw decision-function margins on the test set, plus a zero reference line.
X_test_value = clf.decision_function(X_test)
sorted_va = np.sort(X_test_value)
plt.plot(X_test_value)
plt.plot([0, 120], [0, 0], linestyle='--')


def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x))."""
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)


# Sorted margins pushed through the sigmoid give the calibrated-looking curve.
plt.plot(sigmoid(sorted_va))
# Rank mutual-information scores, then grid-search the logistic-regression
# penalty C with stratified 5-fold CV, collecting per-fold classification
# metrics into a tidy DataFrame.
MI = pd.DataFrame({'MI': MI}, index=selvars)
MI = MI.sort_values('MI', ascending=False)

# Keep the race (RHI), housing (HSG), age and education predictor columns.
var = [
    x for x in data.columns
    if 'RHI' in x or 'HSG' in x or 'AGE' in x or 'EDU' in x
]
X = data[var]
y = data['target']

clf = LogisticRegression()
Cs = np.logspace(-3, 3, 7)
skfold = StratifiedKFold(n_splits=5, shuffle=True)
result = pd.DataFrame(
    columns=['C', 'Fold', 'Acc', 'Recall', 'Precision', 'F1'])

for c in Cs:
    clf.C = c  # set once per C (was re-assigned inside every fold)
    for pos, (train, valid) in enumerate(
            skfold.split(data[var], data['target'])):
        clf.fit(data.iloc[train][var], data.iloc[train]['target'])
        y_pred = clf.predict(data.iloc[valid][var])
        result.loc[len(result)] = [
            c,
            pos + 1,
            clf.score(data.iloc[valid][var], data.iloc[valid]['target']),
            recall_score(data.iloc[valid]['target'], y_pred),
            precision_score(data.iloc[valid]['target'], y_pred),
            f1_score(data.iloc[valid]['target'], y_pred),
        ]

# Select columns with a list of labels: tuple-style selection
# (groupby(...)['A', 'B']) was deprecated and removed in pandas 1.0.
result.groupby('C')[['Acc', 'Recall', 'Precision', 'F1']].mean()
# Under-sample the majority class, grid-search the L1 penalty for logistic
# regression (4-fold CV, ROC AUC), then begin a min_samples_leaf sweep for a
# random forest (that loop's body continues beyond this excerpt).
ustrain = undersample(train)
y = ustrain.convert
X = ustrain.drop('convert', axis=1)
print("Remaining rows", len(ustrain))
# LogisticRegression
C_s = np.logspace(-10, 1, 11)
scores = list()
scores_std = list()
lr = LogisticRegression(penalty='l1')
for C in C_s:
    lr.C = C
    this_scores = cross_val_score(lr, X, y, cv=4, scoring="roc_auc")
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))
# Tabulate the mean AUC for each candidate C.
lr_results = pd.DataFrame({'score': scores, 'C': C_s})
print(lr_results)
# RF
msl_s = [1, 2, 4, 8, 16, 32, 64, 128, 256]
scores = list()
# NOTE(review): duplicated reset of `scores` — presumably one of these was
# meant to be scores_std; confirm against the continuation of this script.
scores = list()
rf = RandomForestClassifier(n_estimators=15)
for msl in msl_s:
    rf.min_samples_leaf = msl
1:]  #just want the probability that is it class 1 (active)

# Write the positive-class probabilities to a CSV-style submission file.
#write to a file the probabilities:
with open('LR1', 'w') as f:
    f.write("GeneId,Prediction\n")
    for i in range(len(active)):
        f.write("%.0f,%f\n" % (geneIDs[i], active[i]))

# Grid over C and both penalty types, scoring each combination with
# 3-fold cross-validated ROC AUC on the training split.
#Experiment with parameter C and penalties l1 and l2 on un-split data:
from sklearn.cross_validation import cross_val_score
C_range = 10.0**np.arange(-5, 0)  #set 4 - slide 55
scores = []
for C in C_range:
    for penalty in ["l1", "l2"]:
        clf.C = C
        clf.penalty = penalty
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        score = cross_val_score(clf, x_train, y_train, cv=3, scoring='roc_auc')
        scores.append((C, penalty, score.mean()))
# NOTE(review): round() on a list of tuples raises TypeError — likely meant
# to round each mean, e.g. [(C, p, round(s, 7)) for C, p, s in scores].
print(round(scores, 7))
#highest auc achieved with 1e-2 and l1
#[(1.0000000000000001e-05, 'l1', 0.5),
#(1.0000000000000001e-05, 'l2', 0.9061314880194854),
#(0.0001, 'l1', 0.85908152750914157),
#(0.0001, 'l2', 0.9069741809466777),
x_train_final=pd.DataFrame(x_train_final) x_train_final=pd.concat([pd.DataFrame(y_train),x_train_final],axis=1) x_test_final=pd.DataFrame(x_test_final) LASSO_run=True if LASSO_run: from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import cross_val_score Cs=np.logspace(-1.5, 1.5, 10) lr_lasso = LogisticRegression(penalty='l1') cv_lasso_scores=list() # Fit lasso to various choices of regularization parameter C to select best C for c in Cs: lr_lasso.C = c cv_lasso_score = cross_val_score(lr_lasso, x_train_final, y_train, scoring='roc_auc', cv=5) cv_lasso_scores.append(np.mean(cv_lasso_score)) print 'Best lambda based on Lasso Cross-Validation...' max_score=np.max(cv_lasso_scores) max_lambda_l1=Cs[cv_lasso_scores.index(max_score)] print 1.0/max_lambda_l1, max_score lr_lasso.C = max_lambda_l1 lr_lasso.fit(x_train_final,y_train) print lr_lasso.coef_ tmp_names=np.array(feature_names) selected_features=tmp_names[lr_lasso.coef_[0] != 0] print 'writing final selected features to file...' selected_features=pd.DataFrame(selected_features)