Example #1
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """

    kf = SKFold(n_splits=K)
    validation_dict = []
    temp = {}
    for c in C:
        for p in penalty:
            temp['C'] = c
            temp['penalty'] = p
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
                y_pred, _ = pred_log(logreg,
                                     nsd(x_train, mode=mode, flag=False),
                                     y_train,
                                     nsd(x_val, mode=mode, flag=False),
                                     flag=True)
                all_classes = logreg.classes_
                loss_val_vec[k] = log_loss(y_val, y_pred, labels=all_classes)
                k += 1

            temp['mu'] = loss_val_vec.mean()
            temp['sigma'] = loss_val_vec.std()
            validation_dict.append(temp)
            temp = {}

    return validation_dict
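A self-contained sketch of the same search-over-folds pattern using only scikit-learn, on synthetic data; StandardScaler stands in for the course's norm_standard (nsd) helper and the fit/predict step replaces pred_log:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
results = []
for c in [0.01, 0.1, 1.0]:
    for p in ['l1', 'l2']:
        losses = []
        for trn, val in StratifiedKFold(n_splits=5).split(X, y):
            scaler = StandardScaler().fit(X[trn])  # scale with the fold's train statistics only
            clf = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000)
            clf.fit(scaler.transform(X[trn]), y[trn])
            losses.append(log_loss(y[val], clf.predict_proba(scaler.transform(X[val]))))
        results.append({'C': c, 'penalty': p, 'mu': np.mean(losses), 'sigma': np.std(losses)})

print(min(results, key=lambda d: d['mu']))  # hyperparameters with the lowest mean validation loss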

Example #2
def kfold_cv_train_set(train_feats, train_labels, true_labels, args):
    """ Run k-fold cross validation on train set

    Args:
        train_feats (np.ndarray): training features (n_samples x dim)
        train_labels (np.ndarray): corresponding labels
        true_labels (np.ndarray): labels passed through to run_clf for evaluation
        args: runtime options (args.nf gives the number of folds)

    Returns:
        np.float64: average classification accuracy over k-folds
        np.float64: average cross-entropy loss over k-folds
    """

    skf = SKFold(n_splits=args.nf, shuffle=True, random_state=0)

    # [acc, x_entropy]
    scores = np.zeros(shape=(args.nf, 2))
    i = 0
    for trn_ixs, dev_ixs in skf.split(train_feats, train_labels):

        (_, _, acc,
         xen), _, _ = run_clf(train_feats[trn_ixs], train_labels[trn_ixs],
                              train_feats[dev_ixs], train_labels[dev_ixs],
                              true_labels, args)

        scores[i, :2] = acc, xen

        i += 1

    return np.mean(scores[:, 0]), np.mean(scores[:, 1])
Example #3
def kfold_cv_dev(train_feats, train_labels, n_folds=5):
    """ Run k-fold cross validation on train set

    Args:
        train_feats (np.ndarray): training features (n_samples x dim)
        train_labels (np.ndarray): corresponding labels
        n_folds (int): number of folds (default=5)

    Returns:
        np.float64: average classification accuracy over k-folds
        np.float64: average cross-entropy loss over k-folds
    """

    skf = SKFold(n_splits=n_folds, shuffle=True, random_state=0)

    # [acc, x_entropy]
    scores = np.zeros(shape=(n_folds, 2))
    i = 0
    for trn_ixs, dev_ixs in skf.split(train_feats, train_labels):

        scores[i, :2] = run_glc(train_feats[trn_ixs], train_labels[trn_ixs],
                                train_feats[dev_ixs], train_labels[dev_ixs])
        i += 1

    return np.mean(scores[:, 0]), np.mean(scores[:, 1])
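All of these snippets alias StratifiedKFold as SKFold; a quick check that it preserves the label balance in every fold, which is the reason to prefer it over plain KFold on imbalanced data:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0] * 90 + [1] * 10)  # 90/10 class imbalance
X = np.zeros((100, 1))
for _, dev in StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(X, y):
    print(np.bincount(y[dev]))     # every fold prints [18  2]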
Example #4
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
                x_train_nsd = nsd(x_train, selected_feat=('LB', 'ASTV'), mode=mode)
                x_val_nsd = nsd(x_val, selected_feat=('LB', 'ASTV'), mode=mode)
                y_pred, _ = pred_log(logreg, x_train_nsd, y_train, x_val_nsd, flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred)
                k += 1
            mu = loss_val_vec.mean()
            std = loss_val_vec.std()
            validation_dict.append({'C': c, 'penalty': p, 'mu': mu, 'sigma': std})
    return validation_dict
Example #5
def cv_kfold_svm(X, y, C, K, gamma=[0], flag='linear'):
    kf = SKFold(n_splits=K)
    svc = svm.SVC(probability=True)
    pipe = Pipeline(steps=[('svm', svc)])
    if gamma == [0]:
        Svm = GridSearchCV(estimator=pipe,
                           param_grid={
                               'svm__kernel': [flag],
                               'svm__C': C
                           },
                           scoring=['roc_auc'],
                           cv=kf,
                           refit='roc_auc',
                           verbose=3,
                           return_train_score=True)
    else:
        Svm = GridSearchCV(estimator=pipe,
                           param_grid={
                               'svm__kernel': [flag],
                               'svm__C': C,
                               'svm__gamma': gamma
                           },
                           scoring=['roc_auc'],
                           cv=kf,
                           refit='roc_auc',
                           verbose=3,
                           return_train_score=True)

    Svm.fit(X, y)
    best_Svm = Svm.best_estimator_
    return best_Svm
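A hypothetical call to cv_kfold_svm above on synthetic data; the imports mirror what the snippet assumes is already in scope:

from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold as SKFold
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, random_state=0)
best_linear = cv_kfold_svm(X, y, C=[0.1, 1, 10], K=5)           # linear kernel, gamma unused
best_rbf = cv_kfold_svm(X, y, C=[0.1, 1, 10], K=5,
                        gamma=[0.01, 0.1], flag='rbf')          # rbf kernel needs a gamma grid
print(best_rbf.get_params()['svm__C'], best_rbf.get_params()['svm__gamma'])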
Example #6
def getOptimalNumberFeatures(X, y):

    for c in X.columns:
        if X[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(X[c].values))
            X[c] = lbl.transform(list(X[c].values))

    # The accuracy scoring is proportional to the number of correct classifications
    rfecv = RFECV(estimator=DecisionTreeClassifier(),
                  step=1,
                  cv=SKFold(5),
                  scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    return rfecv.n_features_
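Note that RFECV.grid_scores_ was removed in scikit-learn 1.2; on recent versions the same plot can be drawn from cv_results_ (a sketch, assuming scikit-learn >= 1.0):

scores = rfecv.cv_results_['mean_test_score']  # mean CV accuracy per feature count
plt.plot(range(1, len(scores) + 1), scores)
plt.xlabel("Number of features selected")
plt.ylabel("Mean cross-validation accuracy")
plt.show()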
Example #7
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            d = {}
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_pred,w = pred_log(logreg, nsd(x_train, ('',''), mode=mode), y[train_idx], nsd(x_val, ('',''), mode=mode), flag=True)
                loss_val_vec[k]= log_loss(y[val_idx], y_pred)
                k+=1
            d['mu'] = loss_val_vec.mean()
            d['sigma'] = np.std(loss_val_vec)
            d['C'] = c
            d['penalty'] = p
            validation_dict.append(d)

    return validation_dict
Example #8
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
                # Normalize each split with the fold's own statistics, then fit/predict
                y_pred, _ = pred_log(logreg,
                                     nsd(x_train, mode=mode),
                                     y_train,
                                     nsd(x_val, mode=mode),
                                     flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred)
                k += 1
            validation_dict.append({'C': c, 'penalty': p,
                                    'mu': loss_val_vec.mean(),
                                    'sigma': loss_val_vec.std()})
    return validation_dict
Example #9
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                # Scale the training and validation data (for each fold)
                x_train = nsd(x_train, mode=mode)
                x_val = nsd(x_val, mode=mode)

                # Fit the model (likely redundant: pred_log with flag=True
                # appears to fit internally, as in the other examples)
                logreg.fit(x_train, y_train)

                # Predict probabilities for the validation split
                y_val_pred, _ = pred_log(logreg,
                                         x_train,
                                         y_train,
                                         x_val,
                                         flag=True)

                # Compute the validation loss
                loss_val_vec[k] = log_loss(y_val, y_val_pred)

                k += 1

            mu = np.mean(loss_val_vec)
            std = loss_val_vec.std()
            validation_dict += [{'C': c, 'penalty': p, 'mu': mu, 'sigma': std}]
    return validation_dict
Example #10
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """

    kf = SKFold(n_splits=K)
    validation_dict = []

    # Normalize the full set once up front, honoring the requested mode
    X = nsd(X, mode=mode, flag=False)

    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_train_vec = np.zeros(K)
            loss_val_vec = np.zeros(K)

            k = 0

            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                logreg.fit(x_train, y_train)

                y_pred_train = logreg.predict_proba(x_train)
                y_pred_val = logreg.predict_proba(x_val)
                loss_train_vec[k] = log_loss(y_train, y_pred_train)
                loss_val_vec[k] = log_loss(y_val, y_pred_val)

                k += 1
            validation_dict.append({
                'C': c,
                'penalty': p,
                'mu': loss_val_vec.mean(),
                'sigma': loss_val_vec.std()
            })

    return validation_dict
Example #11
    def valid_sample(self, x, y, t_id):
        '''
        Determine, for each unary operator, whether transforming the t-th feature
        yields a positive training sample
        :param x: original features
        :param y: ground truth labels
        :param t_id: index of the feature to be transformed
        :return: dictionary like {'log': 1, 'sigmoid': 0}; 1 marks an operator whose
                 CV F1 score beats the original feature's by a factor of at least
                 (1 + self.theta), 0 otherwise
        '''
        x = np.array(x)
        y = np.array(y)
        kfold = SKFold(n_splits=10)
        results_org = []
        results_new = {op: [] for op in unary_collection}

        for train_index, test_index in kfold.split(x, y):
            # Original feature
            rfc_org = RFC()
            rfc_org.fit(x[train_index, t_id:t_id + 1], y[train_index])
            pred_org = rfc_org.predict(x[test_index, t_id:t_id + 1])
            results_org.append(f1_score(y[test_index], pred_org))

            # Constructed feature
            for op in unary_collection:
                operator = op_dict[op]
                rfc_new = RFC()
                new_feature = operator.operate(x[train_index, t_id])
                new_feature = np.reshape(new_feature, (len(new_feature), 1))
                rfc_new.fit(new_feature, y[train_index])
                # print(op,Counter(list(x[test_index, t_id])))
                new_feature = operator.operate(x[test_index, t_id])
                # print(op,Counter(list(new_feature)))
                new_feature = np.reshape(new_feature, (len(new_feature), 1))
                pred_new = rfc_new.predict(new_feature)
                results_new[op].append(f1_score(y[test_index], pred_new))

        result_org = np.mean(results_org)
        result_dict = {}
        for key in results_new:
            result_new = np.mean(results_new[key])
            if result_new >= result_org * (1 + self.theta):
                result_dict[key] = 1
            else:
                result_dict[key] = 0

        return result_dict
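The op_dict and unary_collection objects are not shown here; a hypothetical definition consistent with the .operate(...) calls above could be:

import numpy as np

class LogOp:
    def operate(self, col):
        return np.log(np.abs(col) + 1e-8)        # guard against log(0)

class SigmoidOp:
    def operate(self, col):
        return 1.0 / (1.0 + np.exp(-col))

op_dict = {'log': LogOp(), 'sigmoid': SigmoidOp()}
unary_collection = list(op_dict)                 # ['log', 'sigmoid']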
Example #12
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A list of dictionaries, one per (C, penalty) pair, as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # For each parameter tested in the cross fold validation, we keep a dictionary
                # with the values of the parameters studied and the mean and standard deviation
                # of the loss of the logistic regression.
                y_train, y_val = y[train_idx], y[val_idx]
                y_pred_log, w_log = pred_log(logreg,
                                             nsd(x_train, mode=mode),
                                             y_train,
                                             nsd(x_val, mode=mode),
                                             flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred_log)
                k = k + 1
            elem_dict = {
                "C": c,
                "penalty": p,
                "mu": np.mean(loss_val_vec),
                "sigma": np.std(loss_val_vec)
            }
            validation_dict.append(elem_dict)
    return validation_dict
Example #13
def cv_kfold_logreg(X, y, C, K):
    kf = SKFold(n_splits=K)
    params = {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': C,
        'classifier__solver': ['liblinear']
    }
    pipe = Pipeline([('classifier', LogisticRegression())])
    logreg = GridSearchCV(estimator=pipe,
                          param_grid=params,
                          scoring=['roc_auc'],
                          cv=kf,
                          refit='roc_auc',
                          verbose=3,
                          return_train_score=True)
    logreg.fit(X, y)
    best_logreg = logreg.best_estimator_
    return best_logreg
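A hypothetical call to cv_kfold_logreg on synthetic data; liblinear supports both the 'l1' and 'l2' penalties in the grid above:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
best = cv_kfold_logreg(X, y, C=[0.01, 0.1, 1.0], K=5)
print(best.get_params()['classifier__C'], best.get_params()['classifier__penalty'])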
Example #14
def CrossVal_Regression(k, eta, Lambda, X, z, activation_function_type,
                        solver, n_hidden_neurons, epochs):
    """Cross validation using scikit-learn's MLPRegressor

    Parameters:
        Everything needed to configure the MLPRegressor, plus the number
        of folds k, the design matrix X and the targets z

    Returns:
        error estimates and R2 estimates for the train and test folds
    """
    kf = SKFold(n_splits=k, shuffle=True)  # unused here; fold indices come from KfoldCross below
    Error_test = np.zeros(k)
    R2_test = np.zeros(k)
    Error_train = np.zeros(k)
    R2_train = np.zeros(k)
    scaler = StandardScaler()
    trainIndx, testIndx = KfoldCross(X, k)  # get random fold indices
    for i in range(k):  # for each of the k cross-validation folds
        # Separate into training and testing sets
        X_training = X[trainIndx[i], :]
        X_testing = X[testIndx[i], :]
        z_trainings = z[trainIndx[i]]
        z_testings = z[testIndx[i]]
        # Center the targets using the training mean only
        z_training = z_trainings - np.mean(z_trainings)
        z_testing = z_testings - np.mean(z_trainings)
        # Scale X with statistics from the training fold
        scaler.fit(X_training)
        X_training_scaled = scaler.transform(X_training)
        X_testing_scaled = scaler.transform(X_testing)
        z_training = z_training.reshape((X_training_scaled.shape[0], 1))
        z_testing = z_testing.reshape((X_testing_scaled.shape[0], 1))
        regr = MLPRegressor(learning_rate_init=eta, max_iter=epochs, solver=solver,
                            alpha=Lambda, hidden_layer_sizes=n_hidden_neurons,
                            activation=activation_function_type
                            ).fit(X_training_scaled, z_training.ravel())

        prediction_train = regr.predict(X_training_scaled)
        prediction_test = regr.predict(X_testing_scaled)

        Error_train[i] = MSE(z_training.ravel(), prediction_train)
        R2_train[i] = R2(z_training.ravel(), prediction_train)
        Error_test[i] = MSE(z_testing.ravel(), prediction_test)
        R2_test[i] = R2(z_testing.ravel(), prediction_test)
    error_train_estimate = np.mean(Error_train)
    R2_train_estimate = np.mean(R2_train)
    error_test_estimate = np.mean(Error_test)
    R2_test_estimate = np.mean(R2_test)
    return error_test_estimate, error_train_estimate, R2_test_estimate, R2_train_estimate
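A compact variant of the same loop using scikit-learn's own KFold in place of the custom KfoldCross helper, with metrics from sklearn.metrics; a sketch on synthetic data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

X, z = make_regression(n_samples=200, n_features=5, noise=5.0, random_state=0)
mse, r2 = [], []
for trn, tst in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    scaler = StandardScaler().fit(X[trn])        # scale with training-fold statistics
    regr = MLPRegressor(hidden_layer_sizes=(50,), solver='adam', alpha=1e-3,
                        learning_rate_init=1e-3, max_iter=1000,
                        random_state=0).fit(scaler.transform(X[trn]), z[trn])
    pred = regr.predict(scaler.transform(X[tst]))
    mse.append(mean_squared_error(z[tst], pred))
    r2.append(r2_score(z[tst], pred))
print(np.mean(mse), np.mean(r2))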