def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    temp = {}
    for c in C:
        for p in penalty:
            temp['C'] = c
            temp['penalty'] = p
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
                y_pred, _ = pred_log(logreg, nsd(x_train, mode=mode, flag=False), y_train,
                                     nsd(x_val, mode=mode, flag=False), flag=True)
                # pred_log fits logreg, so classes_ is available at this point
                all_classes = logreg.classes_
                loss_val_vec[k] = log_loss(y_val, y_pred, labels=all_classes)
                k += 1
            temp['mu'] = loss_val_vec.mean()
            temp['sigma'] = loss_val_vec.std()
            validation_dict.append(temp)
            temp = {}
    return validation_dict
def kfold_cv_train_set(train_feats, train_labels, true_labels, args):
    """
    Run k-fold cross validation on train set

    Args:
        train_feats (np.ndarray): training features (n_samples x dim)
        train_labels (np.ndarray): corresponding labels
        true_labels (np.ndarray): ground-truth labels passed through to run_clf
        args: args

    Returns:
        np.float64: average classification accuracy over k-folds
        np.float64: average cross-entropy loss over k-folds
    """
    skf = SKFold(n_splits=args.nf, shuffle=True, random_state=0)
    # [acc, x_entropy]
    scores = np.zeros(shape=(args.nf, 2))
    i = 0
    for trn_ixs, dev_ixs in skf.split(train_feats, train_labels):
        (_, _, acc, xen), _, _ = run_clf(train_feats[trn_ixs], train_labels[trn_ixs],
                                         train_feats[dev_ixs], train_labels[dev_ixs],
                                         true_labels, args)
        scores[i, :2] = acc, xen
        i += 1
    return np.mean(scores[:, 0]), np.mean(scores[:, 1])
def kfold_cv_dev(train_feats, train_labels, n_folds=5):
    """
    Run k-fold cross validation on train set

    Args:
        train_feats (np.ndarray): training features (n_samples x dim)
        train_labels (np.ndarray): corresponding labels
        n_folds (int): number of folds (default=5)

    Returns:
        np.float64: average classification accuracy over k-folds
        np.float64: average cross-entropy loss over k-folds
    """
    skf = SKFold(n_splits=n_folds, shuffle=True, random_state=0)
    # [acc, x_entropy]
    scores = np.zeros(shape=(n_folds, 2))
    i = 0
    for trn_ixs, dev_ixs in skf.split(train_feats, train_labels):
        scores[i, :2] = run_glc(train_feats[trn_ixs], train_labels[trn_ixs],
                                train_feats[dev_ixs], train_labels[dev_ixs])
        i += 1
    return np.mean(scores[:, 0]), np.mean(scores[:, 1])
def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                y_train, y_val = y[train_idx], y[val_idx]
                x_train_nsd = nsd(x_train, selected_feat=('LB', 'ASTV'), mode=mode)
                x_val_nsd = nsd(x_val, selected_feat=('LB', 'ASTV'), mode=mode)
                y_pred, _ = pred_log(logreg, x_train_nsd, y_train, x_val_nsd, flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred)
                k += 1
            mu = loss_val_vec.mean()
            std = loss_val_vec.std()
            validation_dict.append({'C': c, 'penalty': p, 'mu': mu, 'sigma': std})
            # --------------------------------------------------------------------------
    return validation_dict
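# --- Usage sketch (not part of the original snippet) -------------------------
# A minimal, hedged example of calling cv_kfold and picking the best
# hyperparameters by mean validation log-loss. It assumes X is a pandas
# DataFrame, y a NumPy label array, and that nsd/pred_log are already
# imported; the grid values below are illustrative only.
C_grid = [0.01, 0.1, 1, 10, 100]
penalty_grid = ['l1', 'l2']
val_dict = cv_kfold(X, y, C=C_grid, penalty=penalty_grid, K=5, mode='standard')
best = min(val_dict, key=lambda d: d['mu'])
print("Best: C=%s, penalty=%s, log-loss=%.3f +/- %.3f"
      % (best['C'], best['penalty'], best['mu'], best['sigma']))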
def cv_kfold_svm(X, y, C, K, gamma=[0], flag='linear'):
    kf = SKFold(n_splits=K)
    svc = svm.SVC(probability=True)
    pipe = Pipeline(steps=[('svm', svc)])
    if gamma == [0]:
        # [0] is the sentinel default: tune only the kernel type and C
        param_grid = {'svm__kernel': [flag], 'svm__C': C}
    else:
        param_grid = {'svm__kernel': [flag], 'svm__C': C, 'svm__gamma': gamma}
    Svm = GridSearchCV(estimator=pipe, param_grid=param_grid,
                       scoring=['roc_auc'], cv=kf, refit='roc_auc',
                       verbose=3, return_train_score=True)
    Svm.fit(X, y)
    best_Svm = Svm.best_estimator_
    return best_Svm
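# --- Usage sketch (not part of the original snippet) -------------------------
# Hedged example: tuning an RBF-kernel SVM with cv_kfold_svm. X and y are
# assumed to be the preprocessed training matrix and labels; the C and gamma
# grids are illustrative, not tuned values.
best_svm = cv_kfold_svm(X, y, C=[0.1, 1, 10, 100], K=5,
                        gamma=[0.001, 0.01, 0.1], flag='rbf')
print(best_svm.named_steps['svm'])  # inspect the selected SVC parameters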
def getOptimalNumberFeatures(X, y):
    # Label-encode any object-typed (categorical) columns in place
    for c in X.columns:
        if X[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(X[c].values))
            X[c] = lbl.transform(list(X[c].values))

    # The accuracy scoring is proportional to the number of correct classifications
    rfecv = RFECV(estimator=DecisionTreeClassifier(), step=1, cv=SKFold(5), scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv.n_features_
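# --- Compatibility note (not part of the original snippet) -------------------
# RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and later removed.
# On recent versions, the same curve can be drawn from cv_results_; inside
# getOptimalNumberFeatures the plot line would become something like:
mean_scores = rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(mean_scores) + 1), mean_scores)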
def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            d = {}
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                y_pred, w = pred_log(logreg, nsd(x_train, ('', ''), mode=mode), y[train_idx],
                                     nsd(x_val, ('', ''), mode=mode), flag=True)
                loss_val_vec[k] = log_loss(y[val_idx], y_pred)
                k += 1
            d['mu'] = loss_val_vec.mean()
            d['sigma'] = np.std(loss_val_vec)
            d['C'] = c
            d['penalty'] = p
            validation_dict.append(d)
            # --------------------------------------------------------------------------
    return validation_dict
def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------

                # --------------------------------------------------------------------------
    return validation_dict
def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                y_train, y_val = y[train_idx], y[val_idx]
                # First we scale our training and validation data (for each fold)
                x_train = nsd(x_train, mode=mode)
                x_val = nsd(x_val, mode=mode)
                # Fit the model
                logreg.fit(x_train, y_train)
                # Predict y probabilities for the validation segment (based on the fitted model)
                y_val_pred, _ = pred_log(logreg, x_train, y_train, x_val, flag=True)
                # Calculate the loss
                loss_val_vec[k] = log_loss(y_val, y_val_pred)
                k += 1
            mu = np.mean(loss_val_vec)
            std = loss_val_vec.std()
            validation_dict += [{'C': c, 'penalty': p, 'mu': mu, 'sigma': std}]
            # --------------------------------------------------------------------------
    return validation_dict
def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    # Note: normalizing the full set before splitting leaks fold statistics;
    # the per-fold variants above avoid this. The mode parameter is used here
    # instead of a hard-coded 'standard', matching the docstring.
    X = nsd(X, mode=mode, flag=False)
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_train_vec = np.zeros(K)
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
                logreg.fit(x_train, y_train)
                y_pred_train = logreg.predict_proba(x_train)
                y_pred_val = logreg.predict_proba(x_val)
                loss_train_vec[k] = log_loss(y_train, y_pred_train)
                loss_val_vec[k] = log_loss(y_val, y_pred_val)
                k += 1
            validation_dict.append({'C': c,
                                    'penalty': p,
                                    'mu': loss_val_vec.mean(),
                                    'sigma': loss_val_vec.std()})
    return validation_dict
def valid_sample(self, x, y, t_id):
    '''
    Determine whether the t-th feature in features is a positive training sample

    :param x: original features
    :param y: ground truth label
    :param t_id: index of the feature to be transformed
    :return: dictionary, like {'log': 1, 'sigmoid': 0}; 1 for positive and 0 for not positive
        (a transformation is positive when it improves mean F1 by at least self.theta, relative)
    '''
    x = np.array(x)
    y = np.array(y)
    kfold = SKFold(n_splits=10)
    results_org = []
    results_new = {op: [] for op in unary_collection}
    for train_index, test_index in kfold.split(x, y):
        # Original feature
        rfc_org = RFC()
        rfc_org.fit(x[train_index, t_id:t_id + 1], y[train_index])
        pred_org = rfc_org.predict(x[test_index, t_id:t_id + 1])
        results_org.append(f1_score(y[test_index], pred_org))
        # Constructed feature
        for op in unary_collection:
            operator = op_dict[op]
            rfc_new = RFC()
            new_feature = operator.operate(x[train_index, t_id])
            new_feature = np.reshape(new_feature, (len(new_feature), 1))
            rfc_new.fit(new_feature, y[train_index])
            new_feature = operator.operate(x[test_index, t_id])
            new_feature = np.reshape(new_feature, (len(new_feature), 1))
            pred_new = rfc_new.predict(new_feature)
            results_new[op].append(f1_score(y[test_index], pred_new))
    result_org = np.mean(results_org)
    result_dict = {}
    for key in results_new:
        result_new = np.mean(results_new[key])
        result_dict[key] = 1 if result_new >= result_org * (1 + self.theta) else 0
    return result_dict
def cv_kfold(X, y, C, penalty, K, mode):
    """
    :param X: Training set samples
    :param y: Training set labels
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                # For each parameter combination tested in the cross-validation, we keep a
                # dictionary with the parameter values and the mean and standard deviation
                # of the logistic-regression loss.
                y_train, y_val = y[train_idx], y[val_idx]
                y_pred_log, w_log = pred_log(logreg, nsd(x_train, mode=mode), y_train,
                                             nsd(x_val, mode=mode), flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred_log)
                k += 1
            elem_dict = {"C": c,
                         "penalty": p,
                         "mu": np.mean(loss_val_vec),
                         "sigma": np.std(loss_val_vec)}
            validation_dict.append(elem_dict)
            # --------------------------------------------------------------------------
    return validation_dict
def cv_kfold_logreg(X, y, C, K):
    kf = SKFold(n_splits=K)
    params = {'classifier': [LogisticRegression()],
              'classifier__penalty': ['l1', 'l2'],
              'classifier__C': C,
              'classifier__solver': ['liblinear']}
    pipe = Pipeline([('classifier', LogisticRegression())])
    logreg = GridSearchCV(estimator=pipe, param_grid=params,
                          scoring=['roc_auc'], cv=kf, refit='roc_auc',
                          verbose=3, return_train_score=True)
    logreg.fit(X, y)
    best_logreg = logreg.best_estimator_
    return best_logreg
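# --- Usage sketch (not part of the original snippet) -------------------------
# Hedged example call; C values are illustrative, and X, y are assumed to be
# the preprocessed training matrix and labels. Because refit='roc_auc', the
# returned pipeline is already refitted on all of X with the best (penalty, C).
best_pipe = cv_kfold_logreg(X, y, C=[0.01, 0.1, 1, 10], K=5)
print(best_pipe.named_steps['classifier'])  # inspect the selected LogisticRegression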
def CrossVal_Regression(k, eta, Lambda, X, z, activation_function_type, solver, n_hidden_neurons, epochs):
    """Cross Validation using Scikit-Learn's MLPRegressor

    Parameters: Everything that is needed to create an MLPRegressor object
    Returns: error estimates and R2 estimates for train and test error
    """
    Error_test = np.zeros(k)
    R2_test = np.zeros(k)
    Error_train = np.zeros(k)
    R2_train = np.zeros(k)
    scaler = StandardScaler()
    trainIndx, testIndx = KfoldCross(X, k)  # Get random indices
    for i in range(k):  # For the number of cross validations
        # Separate into training and testing sets
        X_training = X[trainIndx[i], :]
        X_testing = X[testIndx[i], :]
        z_trainings = z[trainIndx[i]]
        z_testings = z[testIndx[i]]
        # Center z using the training mean only (no test-set leakage)
        z_training = z_trainings - np.mean(z_trainings)
        z_testing = z_testings - np.mean(z_trainings)
        # Scale X
        scaler.fit(X_training)
        X_training_scaled = scaler.transform(X_training)
        X_testing_scaled = scaler.transform(X_testing)
        z_training = z_training.reshape((X_training_scaled.shape[0], 1))
        z_testing = z_testing.reshape((X_testing_scaled.shape[0], 1))
        regr = MLPRegressor(learning_rate_init=eta, max_iter=epochs, solver=solver, alpha=Lambda,
                            hidden_layer_sizes=n_hidden_neurons,
                            activation=activation_function_type).fit(X_training_scaled, z_training.ravel())
        prediction_train = regr.predict(X_training_scaled)
        prediction_test = regr.predict(X_testing_scaled)
        Error_train[i], R2_train[i] = MSE(z_training.ravel(), prediction_train), R2(z_training.ravel(), prediction_train)
        Error_test[i], R2_test[i] = MSE(z_testing.ravel(), prediction_test), R2(z_testing.ravel(), prediction_test)
    error_train_estimate = np.mean(Error_train)
    R2_train_estimate = np.mean(R2_train)
    error_test_estimate = np.mean(Error_test)
    R2_test_estimate = np.mean(R2_test)
    return error_test_estimate, error_train_estimate, R2_test_estimate, R2_train_estimate
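# --- Usage sketch (not part of the original snippet) -------------------------
# Hedged example call. It assumes X is the design matrix (n_samples x n_features),
# z the regression targets, and that KfoldCross, MSE, and R2 are the project's
# own helpers, importable from the same module. Hyperparameters are illustrative.
test_mse, train_mse, test_r2, train_r2 = CrossVal_Regression(
    k=5, eta=1e-3, Lambda=1e-4, X=X, z=z,
    activation_function_type='relu', solver='adam',
    n_hidden_neurons=(50, 50), epochs=500)
print("CV test MSE %.4f, test R2 %.4f" % (test_mse, test_r2))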