# Example 1
def with_cross_validation(train_data_x, train_data_y, regularization=False):
    """Plot mean train/validation accuracy of a logistic-regression model
    as a function of training-set size, using 5-fold cross-validation.

    Args:
        train_data_x: feature matrix (sliceable; samples along axis 0).
        train_data_y: matching label vector.
        regularization: when True use an L2 penalty, otherwise no penalty.
    """
    c = 0.0005
    pen = 'l2' if regularization else 'none'
    # Train on 20%, 40%, ..., 100% of the 4/5-sized training portion.
    fractions = [0.2, 0.4, 0.6, 0.8, 1]
    sampels_num = [int(frac * len(train_data_x) * (4 / 5)) for frac in fractions]
    acc_train = []
    acc_vald = []
    for size in sampels_num:
        # Fresh logistic-regression model for each training-set size.
        model = LogisticRegression(C=c, max_iter=20000, penalty=pen)
        scores = cross_validate(model,
                                train_data_x[:size],
                                train_data_y[:size],
                                cv=5,
                                scoring='accuracy',
                                return_train_score=True)
        # Mean accuracy over the 5 folds, rounded to 3 decimals.
        mean_vald = np.sum(scores['test_score']) / len(scores['test_score'])
        mean_train = np.sum(scores['train_score']) / len(scores['train_score'])
        acc_vald.append(round(mean_vald, 3))
        acc_train.append(round(mean_train, 3))
    plot_train_vald(
        acc_train,
        acc_vald,
        sampels_num,
        x_label="Training set size (samples)",
        y_label="Mean Accuracy (%)",
        title="Mean Accuracy as function of training set size, cv (5), C={0}".
        format(get_reg_title(c)))
# Example 2
def without_cross_validation(train_data_x, train_data_y, regularization=False):
    """Plot train/validation accuracy of a logistic-regression model as a
    function of training-set size, using a single held-out validation split.

    Args:
        train_data_x: feature matrix (sliceable; samples along axis 0).
        train_data_y: matching label vector.
        regularization: when True use an L2 penalty, otherwise no penalty.
    """
    c = 10
    pen = 'l2' if regularization else 'none'
    training_set_size = int((4 / 5) * len(train_data_x))
    # FIX: validation slice starts at training_set_size (was +1), so the
    # sample at that index is no longer dropped from both sets.
    vald_x = train_data_x[training_set_size:]  # separate to validation set
    vald_y = train_data_y[training_set_size:]
    train_data_x = train_data_x[:training_set_size]
    train_data_y = train_data_y[:training_set_size]
    percentage = [0.2, 0.4, 0.6, 0.8,
                  1]  # will learn with 20%, 40% ,.... 100% of the data
    acc_train = []
    acc_vald = []
    sampels_num = [int(x * len(train_data_x)) for x in percentage]
    for k in sampels_num:
        # FIX: removed leftover debug print of len(train_data_x) * k,
        # which was a meaningless product (k is already a sample count).
        clf = LogisticRegression(C=c, max_iter=20000, penalty=pen)
        clf.fit(train_data_x[:k], train_data_y[:k])  # learn the data
        # Validation subset scales with k (1/5 of the training-subset size).
        y_hat = clf.predict(vald_x[:int(k / 5)])  # predict on the validation set
        acc_vald.append(
            np.sum(y_hat == vald_y[:int(k / 5)]) / len(vald_y[:int(k / 5)]))
        y_hat = clf.predict(train_data_x[:k])  # predict on the training set
        acc_train.append(
            np.sum(y_hat == train_data_y[:k]) / len(train_data_y[:k]))
    plot_train_vald(
        acc_train,
        acc_vald,
        sampels_num,
        x_label="Training set size (samples)",
        y_label="Accuracy (%)",
        title=
        "Accuracy as function of training set size, single validation set, C={}"
        .format(get_reg_title(c)))
def with_cross_validation(train_data_x, train_data_y, regularization=False):
    """Plot mean train/validation accuracy of a small neural network as a
    function of training-set size, using 5-fold cross-validation.

    Args:
        train_data_x: feature matrix (sliceable; samples along axis 0).
        train_data_y: matching label vector.
        regularization: when True use alpha=0.015, otherwise NO_REGULARIZATION.
    """
    alpha = 0.015 if regularization else NO_REGULARIZATION
    # Train on 20%, 40%, ..., 100% of the 4/5-sized training portion.
    fractions = [0.2, 0.4, 0.6, 0.8, 1]
    sampels_num = [int(frac * len(train_data_x) * (4 / 5)) for frac in fractions]
    acc_train = []
    acc_vald = []
    for size in sampels_num:
        # ANN with a single hidden layer of 3 neurons, trained with SGD.
        model = MLPClassifier(hidden_layer_sizes=(3, ),
                              solver='sgd',
                              activation='relu',
                              alpha=alpha,
                              max_iter=20000)
        scores = cross_validate(model,
                                train_data_x[:size],
                                train_data_y[:size],
                                cv=5,
                                scoring='accuracy',
                                return_train_score=True)
        # Mean accuracy over the 5 folds, rounded to 3 decimals.
        mean_vald = np.sum(scores['test_score']) / len(scores['test_score'])
        mean_train = np.sum(scores['train_score']) / len(scores['train_score'])
        acc_vald.append(round(mean_vald, 3))
        acc_train.append(round(mean_train, 3))
    plot_train_vald(
        acc_train,
        acc_vald,
        sampels_num,
        x_label="Training set size (samples)",
        y_label="Mean Accuracy (%)",
        title=
        "Mean Accuracy as function of training set size, cv (5), $\\alpha$={0}"
        .format(alpha))
# Example 4
def with_cross_validation(train_data_x, train_data_y, regularization=False):
    """Plot mean train/validation accuracy of a linear SVM as a function of
    training-set size, using 5-fold cross-validation.

    Args:
        train_data_x: feature matrix (sliceable; samples along axis 0).
        train_data_y: matching label vector.
        regularization: when True use C=1, otherwise NO_REGULARIZATION.
    """
    c = 1 if regularization else NO_REGULARIZATION
    percentage = [0.2, 0.4, 0.6, 0.8,
                  1]  # will learn with 20%, 40% ,.... 100% of the data
    acc_train = []
    acc_vald = []
    samples_num = [int(x * len(train_data_x) * (4 / 5)) for x in percentage]
    for k in samples_num:
        # FIX: removed duplicated assignment (was "clf = clf = svm.SVC(...)").
        # degree-1 polynomial kernel with coef0=0 == a linear kernel.
        clf = svm.SVC(C=c,
                      kernel='poly',
                      degree=1,
                      gamma=1,
                      coef0=0,
                      max_iter=2000000)  # create svm model
        result = cross_validate(clf,
                                train_data_x[:k],
                                train_data_y[:k],
                                cv=5,
                                scoring='accuracy',
                                return_train_score=True
                                )  # using a part of the data every iteration.
        # Mean fold accuracy, rounded to 3 decimals.
        acc_vald.append(
            round(np.sum(result['test_score']) / len(result['test_score']), 3))
        acc_train.append(
            round(
                np.sum(result['train_score']) / len(result['train_score']), 3))
    plot_train_vald(
        acc_train,
        acc_vald,
        samples_num,
        x_label="Training set size (samples)",
        y_label="Mean Accuracy (%)",
        title="Mean Accuracy as function of training set size, cv (5), C={0}".
        format(get_reg_title(c)))
# Example 5
def without_cross_validation(train_data_x, train_data_y, regularization=False):
    """Plot train/validation accuracy of a linear SVM as a function of
    training-set size, using a single held-out validation split.

    Args:
        train_data_x: feature matrix (sliceable; samples along axis 0).
        train_data_y: matching label vector.
        regularization: when True use C=1, otherwise NO_REGULARIZATION.
    """
    c = 1 if regularization else NO_REGULARIZATION
    training_set_size = int((4 / 5) * len(train_data_x))
    # FIX: validation slice starts at training_set_size (was +1), so the
    # sample at that index is no longer dropped from both sets.
    vald_x = train_data_x[training_set_size:]  # separate to validation set
    vald_y = train_data_y[training_set_size:]
    train_data_x = train_data_x[:training_set_size]
    train_data_y = train_data_y[:training_set_size]
    percentage = [0.2, 0.4, 0.6, 0.8,
                  1]  # will learn with 20%, 40% ,.... 100% of the data
    acc_train = []
    acc_vald = []
    sampels_num = [int(x * len(train_data_x)) for x in percentage]
    for k in sampels_num:
        # degree-1 polynomial kernel with coef0=0 == a linear kernel.
        clf = svm.SVC(C=c,
                      kernel='poly',
                      degree=1,
                      gamma=1,
                      coef0=0,
                      max_iter=2000000)  # create svm model

        clf.fit(train_data_x[:k], train_data_y[:k])  # train the model
        # Validation subset scales with k (1/5 of the training-subset size).
        y_hat = clf.predict(vald_x[:int(k /
                                        5)])  # predict on the validation set
        acc_vald.append(
            np.sum(y_hat == vald_y[:int(k / 5)]) / len(vald_y[:int(k / 5)]))
        y_hat = clf.predict(train_data_x[:k])  # predict on the training set
        acc_train.append(
            np.sum(y_hat == train_data_y[:k]) / len(train_data_y[:k]))
    plot_train_vald(
        acc_train,
        acc_vald,
        sampels_num,
        x_label="Training set size (samples)",
        y_label="Accuracy (%)",
        title=
        "Accuracy as function of training set size, single validation set, C={}"
        .format(get_reg_title(c)))
def without_cross_validation(train_data_x, train_data_y, regularization=False):
    """Plot train/validation accuracy of a small neural network as a
    function of training-set size, using a single held-out validation split.

    Args:
        train_data_x: feature matrix (sliceable; samples along axis 0).
        train_data_y: matching label vector.
        regularization: when True use alpha=0.015, otherwise NO_REGULARIZATION.
    """
    alpha = 0.015 if regularization else NO_REGULARIZATION
    training_set_size = int((4 / 5) * len(train_data_x))
    # FIX: validation slice starts at training_set_size (was +1), so the
    # sample at that index is no longer dropped from both sets.
    vald_x = train_data_x[training_set_size:]  # separate to validation set
    vald_y = train_data_y[training_set_size:]

    train_data_x = train_data_x[:training_set_size]
    train_data_y = train_data_y[:training_set_size]

    percentage = [0.2, 0.4, 0.6, 0.8,
                  1]  # will learn with 20%, 40% ,.... 100% of the data
    acc_train = []
    acc_vald = []
    sampels_num = [int(x * len(train_data_x)) for x in percentage]
    for k in sampels_num:
        # ANN with a single hidden layer of 3 neurons, trained with SGD.
        clf = MLPClassifier(hidden_layer_sizes=(3, ),
                            solver='sgd',
                            activation='relu',
                            alpha=alpha,
                            max_iter=20000)
        clf.fit(train_data_x[:k], train_data_y[:k])  # training the model
        # Validation subset scales with k (1/5 of the training-subset size).
        y_hat = clf.predict(vald_x[:int(k / 5)])  # predict on the validation set
        acc_vald.append(
            np.sum(y_hat == vald_y[:int(k / 5)]) / len(vald_y[:int(k / 5)]))
        y_hat = clf.predict(train_data_x[:k])  # predict on the training set
        acc_train.append(
            np.sum(y_hat == train_data_y[:k]) / len(train_data_y[:k]))
    plot_train_vald(
        acc_train,
        acc_vald,
        sampels_num,
        x_label="Training set size (samples)",
        y_label="Accuracy (%)",
        title=
        "Accuracy as function of training set size, single validation set, $\\alpha$={0}"
        .format(alpha))