Example #1
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A dictionary as explained in the notebook
    """

    kf = SKFold(n_splits=K)
    validation_dict = []
    temp = {}
    for c in C:
        for p in penalty:
            temp['C'] = c
            temp['penalty'] = p
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
                y_pred, _ = pred_log(logreg,
                                     nsd(x_train, mode=mode, flag=False),
                                     y_train,
                                     nsd(x_val, mode=mode, flag=False),
                                     flag=True)
                all_classes = logreg.classes_
                loss_val_vec[k] = log_loss(y_val, y_pred, labels=all_classes)
                k += 1

            temp['mu'] = loss_val_vec.mean()
            temp['sigma'] = loss_val_vec.std()
            validation_dict.append(temp)
            temp = {}

    return validation_dict
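
# A minimal usage sketch of cv_kfold as documented above. The grid values, the
# X_train / y_train names and the chosen mode are illustrative assumptions, not
# values taken from the notebook.
C_grid = [0.01, 0.1, 1, 10, 100]   # hypothetical regularization strengths
penalty_grid = ['l1', 'l2']        # norms supported by the 'saga' solver
results = cv_kfold(X_train, y_train, C=C_grid, penalty=penalty_grid, K=5, mode='standard')

# The return value is a list with one dict per (C, penalty) pair; pick the one
# with the lowest mean validation log-loss.
best = min(results, key=lambda d: d['mu'])
print('best C = {}, penalty = {}, mean log-loss = {:.3f}'.format(best['C'], best['penalty'], best['mu']))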

Example #2
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A dictionary as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                y_train, y_val = y[train_idx], y[val_idx]
                x_train_nsd = nsd(x_train, selected_feat=('LB', 'ASTV'), mode=mode)
                x_val_nsd = nsd(x_val, selected_feat=('LB', 'ASTV'), mode=mode)
                y_pred, _ = pred_log(logreg, x_train_nsd, y_train, x_val_nsd, flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred)
                k += 1
            mu = loss_val_vec.mean()
            std = loss_val_vec.std()
            validation_dict.append({'C': c, 'penalty': p, 'mu': mu, 'sigma': std})
        # --------------------------------------------------------------------------
    return validation_dict
Example #3
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A dictionary as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                y_train, y_val = y[train_idx], y[val_idx]

                # First, scale the training and validation data (separately for each fold)
                x_train = nsd(x_train, mode=mode)
                x_val = nsd(x_val, mode=mode)

                # fitting the model
                logreg.fit(x_train, y_train)

                # Predicting y probabilities for validation segment (based on fitted model)
                y_val_pred, _ = pred_log(logreg,
                                         x_train,
                                         y_train,
                                         x_val,
                                         flag=True)

                # Calculates the loss
                loss_val_vec[k] = log_loss(y_val, y_val_pred)

                k += 1

            mu = np.mean(loss_val_vec)
            std = loss_val_vec.std()
            validation_dict += [{'C': c, 'penalty': p, 'mu': mu, 'sigma': std}]
        # --------------------------------------------------------------------------
    return validation_dict
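
# Every example on this page delegates fitting and prediction to pred_log, whose
# implementation is not shown here. The sketch below is an assumption about what
# such a helper could look like (flag=True returning class probabilities, w being
# the fitted coefficients); the real signature lives in the assignment's module.
def pred_log(logreg, x_train, y_train, x_test, flag=False):
    """Hypothetical sketch: fit logreg on the training fold and predict on x_test."""
    logreg.fit(x_train, y_train)
    # With flag=True, return probabilities (as required by log_loss above);
    # otherwise return hard class labels.
    y_pred = logreg.predict_proba(x_test) if flag else logreg.predict(x_test)
    w = logreg.coef_
    return y_pred, w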
Example #4
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A dictionary as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga', penalty=p, C=c, max_iter=10000, multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            d = {}
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                y_pred, w = pred_log(logreg, nsd(x_train, ('', ''), mode=mode), y[train_idx],
                                     nsd(x_val, ('', ''), mode=mode), flag=True)
                loss_val_vec[k] = log_loss(y[val_idx], y_pred)
                k += 1
            d['mu'] = loss_val_vec.mean()
            d['sigma'] = np.std(loss_val_vec)
            d['C'] = c
            d['penalty'] = p
            validation_dict.append(d)

        # --------------------------------------------------------------------------
    return validation_dict
Example #5
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A dictionary as explained in the notebook
    """
    kf = SKFold(n_splits=K)
    validation_dict = []
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_val_vec = np.zeros(K)
            k = 0
            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                # ------------------ IMPLEMENT YOUR CODE HERE:-----------------------------
                # For each parameter tested in the cross fold validation, we keep a dictionary
                # with the values of the parameters studied and the mean and standard deviation
                # of the loss of the logistic regression.
                y_train, y_val = y[train_idx], y[val_idx]
                y_pred_log, w_log = pred_log(logreg,
                                             nsd(x_train, mode=mode),
                                             y_train,
                                             nsd(x_val, mode=mode),
                                             flag=True)
                loss_val_vec[k] = log_loss(y_val, y_pred_log)
                k = k + 1
            elem_dict = {
                "C": c,
                "penalty": p,
                "mu": np.mean(loss_val_vec),
                "sigma": np.std(loss_val_vec)
            }
            validation_dict.append(elem_dict)
        # --------------------------------------------------------------------------
    return validation_dict
Example #6
def cv_kfold(X, y, C, penalty, K, mode):
    """
    
    :param X: Training set samples
    :param y: Training set labels 
    :param C: A list of regularization parameters
    :param penalty: A list of types of norm
    :param K: Number of folds
    :param mode: Mode of normalization (parameter of norm_standard function in clean_data module)
    :return: A dictionary as explained in the notebook
    """

    kf = SKFold(n_splits=K)
    validation_dict = []

    # Scale the full dataset once, before the CV split (using the requested mode)
    X = nsd(X, mode=mode, flag=False)

    i = 0
    for c in C:
        for p in penalty:
            logreg = LogisticRegression(solver='saga',
                                        penalty=p,
                                        C=c,
                                        max_iter=10000,
                                        multi_class='ovr')
            loss_train_vec = np.zeros(K)
            loss_val_vec = np.zeros(K)

            k = 0

            for train_idx, val_idx in kf.split(X, y):
                x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                logreg.fit(x_train, y_train)

                y_pred_train = logreg.predict_proba(x_train)
                y_pred_val = logreg.predict_proba(x_val)
                loss_train_vec[k] = log_loss(y_train, y_pred_train)
                loss_val_vec[k] = log_loss(y_val, y_pred_val)

                k += 1
            validation_dict.append({
                'C': c,
                'penalty': p,
                'mu': loss_val_vec.mean(),
                'sigma': loss_val_vec.std()
            })
            i += 1

    return validation_dict
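
# Example #6 scales the whole dataset once before splitting, whereas the other
# examples rescale each fold separately. If scikit-learn's own scaler were
# acceptable in place of norm_standard, that per-fold behaviour could also be
# obtained with a Pipeline; this is an alternative sketch, not what the notebook
# asks for.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

pipe = Pipeline([
    ('scale', StandardScaler()),   # re-fitted inside every training fold, so the
                                   # validation fold never leaks into the scaling step
    ('clf', LogisticRegression(solver='saga', penalty='l2', C=1.0,
                               max_iter=10000, multi_class='ovr')),
])
scores = cross_val_score(pipe, X, y, cv=StratifiedKFold(n_splits=5), scoring='neg_log_loss')
print('mean validation log-loss:', -scores.mean())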
Example #7
    #####################################

    feature = ''  # fill your chosen feature
    thresh = 10  # fill the threshold
    filt_feature = phpr(c_samp, feature, thresh)

    #####################################

    with open('objs.pkl', 'rb') as f:
        CTG_features, CTG_morph, fetal_state = pickle.load(f)
    orig_feat = CTG_features.columns.values

    #####################################

    selected_feat = ('LB', 'ASTV')
    orig = nsd(CTG_features, selected_feat, flag=False)
    nsd_std = nsd(CTG_features, selected_feat, mode='standard', flag=False)
    nsd_norm = nsd(CTG_features, selected_feat, mode='MinMax', flag=False)
    nsd_norm_mean = nsd(CTG_features, selected_feat, mode='mean', flag=False)

    #####################################

    g = sns.countplot(x='NSP', data=fetal_state)
    g.set(xticklabels=['Normal', 'Suspect', 'Pathology'])
    plt.show()
    idx_1 = (fetal_state == 1).index[(fetal_state == 1)['NSP'] == True].tolist()
    idx_2 = (fetal_state == 2).index[(fetal_state == 2)['NSP'] == True].tolist()
    idx_3 = (fetal_state == 3).index[(fetal_state == 3)['NSP'] == True].tolist()
    print("Normal samples account for " + str("{0:.2f}".format(100 * len(idx_1) / len(fetal_state))) + "% of the data.")
    print(
        "Suspect samples account for " + str("{0:.2f}".format(100 * len(idx_2) / len(fetal_state))) + "% of the data.")
Example #8
import pickle

with open('objs.pkl', 'rb') as f:
    CTG_features, CTG_morph, fetal_state = pickle.load(f)
orig_feat = CTG_features.columns.values
# -

# Now we will address an important step in data science called feature scaling. Here we will discuss standardization and normalization. As you saw in the lectures, scaling lets us take features whose values lie in different ranges and map them to “normalized” features whose values lie in similar ranges.
#
# Implement the function `norm_standard` with four inputs: `data`, `selected_feat`, `mode` and `flag`. The function returns the **whole data** normalized/standardized per series according to *mode*, but you should also choose two features for a visual comparison (using histograms) between the original data and the different modes. Use `matplotlib` as you saw in your tutorials. The argument `flag` controls the visibility of the histograms. There are three types of `mode`: `'standard'`, `'MinMax'` and `'mean'`. Look up their meanings in the second lecture, slides 46-47. The first call uses `mode=none` and `flag=False` (as defaults). Don't change these defaults. The only variable you are allowed to change in the next call is `selected_feat`.

# +
# from clean_data import norm_standard as nsd
#
selected_feat = ('LB', 'ASTV')
orig = nsd(CTG_features, selected_feat, flag=True)
nsd_std = nsd(CTG_features, selected_feat, mode='standard', flag=True)
nsd_norm = nsd(CTG_features, selected_feat, mode='MinMax', flag=True)
nsd_norm_mean = nsd(CTG_features, selected_feat, mode='mean', flag=True)
# -

# ### Questions:
# **Q4:** Explain why normalization is not useful when there are outliers with extremely large or small values.

# ### Answers:
# **Q4:**

# # Part II: Linear Classifiers

# Finally, after all of the hard work we can now harvest the fruits (your functions from Part I) in order to do some proper machine learning!
#
Example #9
    feature = 'UC'
    thresh = 10  # Uterine contraction cannot be more than 10 contractions per 10 minutes.
    filt_feature = phpr(c_samp, feature, thresh)
    print(filt_feature)

    #####################################

    with open('objs.pkl', 'rb') as f:
        CTG_features, CTG_morph, fetal_state = pickle.load(f)
    orig_feat = CTG_features.columns.values

    #####################################

    selected_feat = ('LB', 'ASTV')
    orig = nsd(CTG_features, selected_feat, flag=False)
    nsd_std = nsd(CTG_features, selected_feat, mode='standard', flag=False)
    nsd_norm = nsd(CTG_features, selected_feat, mode='MinMax', flag=False)
    nsd_norm_mean = nsd(CTG_features, selected_feat, mode='mean', flag=False)

    nsd_std.hist(bins=100)
    nsd_norm.hist(bins=100)
    nsd_norm_mean.hist(bins=100)
    plt.show()

    # #####################################
    #
    g = sns.countplot(x='NSP', data=fetal_state)
    g.set(xticklabels=['Normal', 'Suspect', 'Pathology'])
    plt.show()
    idx_1 = (fetal_state == 1).index[(
        fetal_state == 1)['NSP'] == True].tolist()
Example #10
    # #####################################

    feature = 'UC'
    thresh = 10  # Uterine contraction cannot be more than 10 contractions per 10 minutes.
    filt_feature = phpr(c_samp, feature, thresh)

    #####################################

    with open('objs.pkl', 'rb') as f:
        CTG_features, CTG_morph, fetal_state = pickle.load(f)
    orig_feat = CTG_features.columns.values

    #####################################

    selected_feat = ('LB', 'ASTV')
    orig = nsd(CTG_features, selected_feat, flag=True)
    nsd_std = nsd(CTG_features, selected_feat, mode='standard', flag=True)
    nsd_norm = nsd(CTG_features, selected_feat, mode='MinMax', flag=True)
    nsd_norm_mean = nsd(CTG_features, selected_feat, mode='mean', flag=True)

    #####################################

    g = sns.countplot(x='NSP', data=fetal_state)
    g.set(xticklabels=['Normal', 'Suspect', 'Pathology'])
    plt.show()
    idx_1 = (fetal_state == 1).index[(
        fetal_state == 1)['NSP'] == True].tolist()
    idx_2 = (fetal_state == 2).index[(
        fetal_state == 2)['NSP'] == True].tolist()
    idx_3 = (fetal_state == 3).index[(
        fetal_state == 3)['NSP'] == True].tolist()
Example #11
import pickle

with open('objs.pkl', 'rb') as f:
    CTG_features, CTG_morph, fetal_state = pickle.load(f)
orig_feat = CTG_features.columns.values

# Now we will address an important step in data science called feature scaling. Here we will discuss standardization and normalization. As you saw in the lectures, scaling lets us take features whose values lie in different ranges and map them to “normalized” features whose values lie in similar ranges.
#
# Implement the function `norm_standard` with four inputs: `data`, `selected_feat`, `mode` and `flag`. The function returns the **whole data** normalized/standardized per series according to *mode*, but you should also choose two features for a visual comparison (using histograms) between the original data and the different modes. Use `matplotlib` as you saw in your tutorials. The argument `flag` controls the visibility of the histograms. There are three types of `mode`: `'standard'`, `'MinMax'` and `'mean'`. Look up their meanings in the second lecture, slides 46-47. The first call uses `mode=none` and `flag=False` (as defaults). Don't change these defaults. The only variable you are allowed to change in the next call is `selected_feat`.

# In[ ]:

from clean_data import norm_standard as nsd

selected_feat = ('LB', 'ASTV')
orig = nsd(CTG_features, selected_feat, flag=True)
nsd_std = nsd(CTG_features, selected_feat, mode='standard', flag=True)
nsd_norm = nsd(CTG_features, selected_feat, mode='MinMax', flag=True)
nsd_norm_mean = nsd(CTG_features, selected_feat, mode='mean', flag=True)
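
# Based on the description above ('standard', 'MinMax' and 'mean' modes applied
# per column, plus optional histograms of two selected features), this is a
# hypothetical sketch of norm_standard; the graded implementation lives in the
# clean_data module and may differ.
import matplotlib.pyplot as plt


def norm_standard(data, selected_feat=('LB', 'ASTV'), mode='none', flag=False):
    """Sketch only: scale a DataFrame column-wise according to mode."""
    x = data.copy()
    if mode == 'standard':                      # zero mean, unit variance
        x = (x - x.mean()) / x.std()
    elif mode == 'MinMax':                      # rescale every column to [0, 1]
        x = (x - x.min()) / (x.max() - x.min())
    elif mode == 'mean':                        # mean normalization
        x = (x - x.mean()) / (x.max() - x.min())
    # any other mode (the default) leaves the data untouched
    if flag:                                    # visual comparison of two features
        x[list(selected_feat)].hist(bins=100)
        plt.suptitle('mode = {}'.format(mode))
        plt.show()
    return x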

# ### Questions:
# **Q4:** Explain why normalization is not useful when there are outliers with extremely large or small values.

# ### Answers:
# **Q4:**

# # Part II: Linear Classifiers

# Finally, after all of the hard work we can now harvest the fruits (your functions from Part I) in order to do some proper machine learning!
#
# Note: It is recommended that you attend the second workshop for this part and use the notes in your homework folder.