Example #1
def learning_curve(classifier, y, train, cv, n=15):
    """Plot train and cv loss for increasing train sample sizes."""
    chunk = int(len(y)/n)
    n_samples = []
    train_losses = []
    cv_losses = []
    previous_cache_dir = classifier.cache_dir
    classifier.cache_dir = "diagnostics"

    for i in range(n):
        train_subset = train[:(i + 1)*chunk]
        preds_cv = classifier.fit_predict(y, train_subset, cv,
                                          show_steps=False)
        preds_train = classifier.fit_predict(y, train_subset, train_subset,
                                             show_steps=False)
        n_samples.append((i + 1)*chunk)
        cv_losses.append(hinge_loss(y[cv], preds_cv))
        train_losses.append(hinge_loss(y[train_subset], preds_train))

    classifier.cache_dir = previous_cache_dir
    plt.clf()
    plt.plot(n_samples, train_losses, 'r--', n_samples, cv_losses, 'b--')
    plt.ylim([min(train_losses) - .01, max(cv_losses) + .01])

    plt.savefig('plots/learning_curve.png')
    plt.show()
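The helper assumes a custom classifier object exposing a cache_dir attribute and a fit_predict(y, train_idx, predict_idx, show_steps=...) method that returns decision values. A minimal sketch of that interface, with hypothetical names that are not part of the original code:

import numpy as np
from sklearn.svm import LinearSVC

class CachedClassifier:
    """Hypothetical stand-in exposing the interface learning_curve expects."""
    def __init__(self, X):
        self.X = X
        self.cache_dir = "cache"   # learning_curve temporarily swaps this for "diagnostics"

    def fit_predict(self, y, train_idx, predict_idx, show_steps=True):
        # Fit on the training indices, return decision values for predict_idx.
        model = LinearSVC().fit(self.X[train_idx], y[train_idx])
        return model.decision_function(self.X[predict_idx])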
Example #2
def check_lambda(datanm, samples_per_class, Cs, num_classes, gamma, num_iter = 100, kernel = 'linear', strat = 'ovr'):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.3, train_size=0.7, random_state=None)
    ans = np.zeros((len(Cs), len(gamma), 4))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index , :], labels[test_index ]]

        for j, g in enumerate(gamma):
            for i, C in enumerate(Cs):
                clf = svm.SVC(C=C, kernel=kernel, degree=3, gamma=g, coef0=0.0, shrinking=True,
                                  probability=False, tol=0.001,  cache_size=10000, class_weight=None,
                                  verbose=False, max_iter=-1, decision_function_shape=strat, random_state=None)
                clf.fit(train_data[0], train_data[1])

                out_train = clf.decision_function(train_data[0])
                out_valid = clf.decision_function(valid_data[0])

                ans[i, j, 2] += hinge_loss(train_data[1], out_train, range(num_classes))
                ans[i, j, 3] += hinge_loss(valid_data[1], out_valid, range(num_classes))

                #ans[i, j, 0] += log_loss(train_data[1], clf.predict_proba(train_data[0]))
                #ans[i, j, 1] += log_loss(valid_data[1], clf.predict_proba(valid_data[0]))

    ans[:, :, :] /= num_iter

    np.savez("svm_lambda_" + kernel + '_' + strat, ans= ans, Cs = Cs, num_iter = num_iter, num_classes = num_classes, samples_per_class = samples_per_class)
    return ans
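The third positional argument to hinge_loss here is labels, the full multiclass label set. A hedged sketch of reading back the grid this function saves (the filename below assumes the default kernel='linear' and strat='ovr') and picking the settings with the lowest validation hinge loss, which is stored in channel 3:

import numpy as np

res = np.load("svm_lambda_linear_ovr.npz")
val_hinge = res["ans"][:, :, 3]                      # averaged validation hinge loss
i, j = np.unravel_index(val_hinge.argmin(), val_hinge.shape)
print("best C:", res["Cs"][i], "best gamma index:", j)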
Example #3
def svm(x, y, p):

    ##Default parameters
    default_parameters = [
        0.001, 1000
    ]  ##Parameters with order: Tolerance, maximum iteration

    ##set custom parameters
    for i in range(len(p)):
        if p[i] != "":
            default_parameters[i] = p[i]

    ##create model
    model = LinearSVC(tol=default_parameters[0],
                      max_iter=default_parameters[1])

    ##Train and test
    accuracy = [
        model.fit(x[train], y[train]).score(x[test], y[test])
        for train, test in kf.split(x)
    ]
    res = np.array(accuracy)

    print("\nSupport Vector Machine\n----------------------\nAccuracy: %.2f" %
          res.mean())
    print("Loss: %.2f" % hinge_loss(y, model.decision_function(x)))

    info = [
        '%.2f' % res.mean(),
        '%.2f' % hinge_loss(y, model.decision_function(x))
    ]

    return model, info, default_parameters
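The function relies on a module-level kf splitter and on np, LinearSVC, and hinge_loss already being in scope. A hedged usage sketch under those assumptions:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import hinge_loss
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

kf = KFold(n_splits=5, shuffle=True, random_state=0)  # the global the function expects
x, y = load_breast_cancer(return_X_y=True)
model, info, params = svm(x, y, ["", 5000])  # keep default tol, raise max_iter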
Example #4
def learning_curve(classifier, y, train, cv, n=15):
    """Plot train and cv loss for increasing train sample sizes."""
    chunk = int(len(y)/n)
    n_samples = []
    train_losses = []
    cv_losses = []
    previous_cache_dir = classifier.cache_dir
    classifier.cache_dir = "diagnostics"

    for i in range(n):
        train_subset = train[:(i + 1)*chunk]
        preds_cv = classifier.fit_predict(y, train_subset, cv,
                                          show_steps=False)
        preds_train = classifier.fit_predict(y, train_subset, train_subset,
                                             show_steps=False)
        n_samples.append((i + 1)*chunk)
        cv_losses.append(hinge_loss(y[cv], preds_cv, neg_label=0))
        train_losses.append(hinge_loss(y[train_subset], preds_train,
                            neg_label=0))

    classifier.cache_dir = previous_cache_dir
    plt.clf()
    plt.plot(n_samples, train_losses, 'r--', n_samples, cv_losses, 'b--')
    plt.ylim([min(train_losses) - .01, max(cv_losses) + .01])

    plt.savefig('plots/learning_curve.png')
    plt.show()
Example #5
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)
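A hand check of the expected value: binary hinge loss averages max(0, 1 - margin), where the margin is y_true * pred_decision.

import numpy as np

y_true = np.array([-1, 1, 1, -1])
pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
losses = np.maximum(0, 1 - y_true * pred_decision)  # [0, 0.5, 0, 0.7]
print(losses.mean())                                # 0.3 == 1.2 / 4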
Example #6
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)
Example #7
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision, pos_label=2, neg_label=0))
Example #8
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4,
                 hinge_loss(y_true, pred_decision, pos_label=2, neg_label=0))
Example #9
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    with warnings.catch_warnings():
        # Test deprecated pos_label
        assert_equal(hinge_loss(-y_true, pred_decision), hinge_loss(y_true, pred_decision, pos_label=-1, neg_label=1))

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))
    with warnings.catch_warnings():
        # Test deprecated pos_label
        assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision, pos_label=2, neg_label=0))
Example #10
def svm_classify(all_x, all_y, n_pixels):
    # read the original data
    accuracies = np.zeros([len(all_y), len(frames)])
    losses = np.zeros([len(all_y), len(frames)])
    coef = np.zeros([len(all_y), len(frames), n_pixels])

    for idx, validation_y in enumerate(all_y):
        train_indx = np.delete(np.arange(len(all_y)), idx)
        for f in range(n_frames):
            train_x = all_x[train_indx, f, :]
            validation_x = all_x[idx, f, :].reshape([1, -1])
            m = np.mean(train_x, axis=0)
            s = np.std(train_x, axis=0)
            train_x = (train_x - m) / s
            validation_x = (validation_x - m) / s

            train_y = all_y[train_indx]
            clf = svm.SVC(kernel='linear', class_weight='balanced')
            clf.fit(train_x, train_y)

            coef[idx, f, :] = clf.coef_

            pred = clf.predict(validation_x)
            acc = pred == validation_y
            accuracies[idx, f] = acc
            est = clf.decision_function(validation_x)
            loss = hinge_loss([validation_y], est)
            losses[idx, f] = loss
    return accuracies, losses, coef
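The function also reads module-level frames and n_frames globals that the snippet does not define; a hedged stub for those assumptions:

import numpy as np
from sklearn import svm
from sklearn.metrics import hinge_loss

frames = list(range(20))  # hypothetical per-trial time frames
n_frames = len(frames)    # all_x is then expected to be (n_subjects, n_frames, n_pixels)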
Example #11
    def validation_metric_vw(self):
        y_pred_holdout = self.get_y_pred_holdout()

        if self.outer_loss_function == 'logistic':
            if self.labels_clf_count > 2:
                y_pred_holdout_proba = y_pred_holdout
            else:
                y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

        elif self.outer_loss_function == 'squared':
            loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'hinge':
            loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'pr-auc':
            loss = -average_precision_score(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'roc-auc':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
            loss = -auc(fpr, tpr)

        else:
            raise KeyError('Invalid outer loss function')

        self.logger.info('parameter suffix: %s' % self.param_suffix)
        self.logger.info('loss value: %.6f' % loss)

        return loss
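The 'hinge' branch feeds VW's raw margin predictions straight into hinge_loss, which assumes y_true_holdout uses VW's {-1, +1} label convention. A minimal check of that branch:

from sklearn.metrics import hinge_loss

y_true_holdout = [-1, 1, 1, -1]          # VW-style labels
y_pred_holdout = [-2.1, 0.4, 1.7, 0.2]   # raw margins
print(hinge_loss(y_true_holdout, y_pred_holdout))  # mean(max(0, 1 - y*m)) = 0.45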
Example #12
def test_hinge_loss_multiclass_invariance_lists():
    # Currently, invariance of string and integer labels cannot be tested
    # in common invariance tests because invariance tests for multiclass
    # decision functions is not implemented yet.
    y_true = ['blue', 'green', 'red',
              'green', 'white', 'red']
    pred_decision = [
        [0.36, -0.17, -0.58, -0.99],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38,  -0.17],
        [-0.55, -0.38, -0.48, -0.58],
        [-2.36, -0.79, -0.27,  0.24],
        [-1.45, -0.58, -0.38,  -0.17]]
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][3] + pred_decision[4][2],
        1 - pred_decision[5][2] + pred_decision[5][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision),
                 dummy_hinge_loss)
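The "dummy" computation mirrors scikit-learn's multiclass hinge: the margin is the true class's score minus the largest score among the other classes. A hedged re-implementation under that definition, reusing the test's data:

import numpy as np

def multiclass_hinge(y_idx, scores):
    scores = np.asarray(scores, dtype=float)
    rows = np.arange(len(y_idx))
    true_scores = scores[rows, y_idx]
    others = scores.copy()
    others[rows, y_idx] = -np.inf          # mask out the true class
    return np.maximum(0, 1 - true_scores + others.max(axis=1)).mean()

labels = np.array(['blue', 'green', 'red', 'white'])  # sorted, as sklearn sorts them
y_true = np.array(['blue', 'green', 'red', 'green', 'white', 'red'])
pred_decision = np.array([
    [0.36, -0.17, -0.58, -0.99],
    [-0.55, -0.38, -0.48, -0.58],
    [-1.45, -0.58, -0.38, -0.17],
    [-0.55, -0.38, -0.48, -0.58],
    [-2.36, -0.79, -0.27, 0.24],
    [-1.45, -0.58, -0.38, -0.17]])
print(multiclass_hinge(np.searchsorted(labels, y_true), pred_decision))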
Example #13
    def validation_metric_vw(self):
        y_pred_holdout = self.get_y_pred_holdout()

        if self.outer_loss_function == 'logistic':
            if self.labels_clf_count > 2:
                y_pred_holdout_proba = y_pred_holdout
            else:
                y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

        elif self.outer_loss_function == 'squared':
            loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'hinge':
            loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'pr-auc':
            loss = -average_precision_score(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'roc-auc':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
            loss = -auc(fpr, tpr)

        else:
            raise KeyError('Invalid outer loss function')

        self.logger.info('parameter suffix: %s' % self.param_suffix)
        self.logger.info('loss value: %.6f' % loss)

        return loss
Example #14
def test_hinge_loss_multiclass_invariance_lists():
    # Currently, invariance of string and integer labels cannot be tested
    # in common invariance tests because invariance tests for multiclass
    # decision functions is not implemented yet.
    y_true = ['blue', 'green', 'red',
              'green', 'white', 'red']
    pred_decision = [
        [0.36, -0.17, -0.58, -0.99],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38,  -0.17],
        [-0.55, -0.38, -0.48, -0.58],
        [-2.36, -0.79, -0.27,  0.24],
        [-1.45, -0.58, -0.38,  -0.17]]
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][3] + pred_decision[4][2],
        1 - pred_decision[5][2] + pred_decision[5][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision),
                 dummy_hinge_loss)
Example #15
    def obj(self, data, score, C1, C2):
        d, n = data.weight.shape[0], len(data.y)
        embed_loss = norm(self.X - data.weight.dot(
            self.X), 'fro')**2 / d + self.delta * norm(self.X, 'fro')**2 / d
        obj = n*hinge_loss(y_true=data.y, pred_decision=score) \
            + 1./(2.*C1)*np.sum(self.beta**2) + 1./(2.*C2)*embed_loss
        return obj
Example #16
    def validation_metric_vw(self):
        v = open('%s' % self.holdout_pred, 'r')
        y_pred_holdout = []
        for line in v:
            y_pred_holdout.append(float(line.split()[0].strip()))

        if self.outer_loss_function == 'logistic':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

        elif self.outer_loss_function == 'squared':
            loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'hinge':
            loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'pr-auc':
            loss = -average_precision_score(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'roc-auc':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
            loss = -auc(fpr, tpr)

        self.logger.info('parameter suffix: %s' % self.param_suffix)
        self.logger.info('loss value: %.6f' % loss)

        return loss
Example #17
    def validation_metric_vw(self):
        v = open('%s' % self.holdout_pred, 'r')
        y_pred_holdout = []
        for line in v:
            y_pred_holdout.append(float(line.split()[0].strip()))

        if self.outer_loss_function == 'logistic':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            loss = log_loss(self.y_true_holdout, y_pred_holdout_proba)

        elif self.outer_loss_function == 'squared':
            loss = mean_squared_error(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'hinge':
            loss = hinge_loss(self.y_true_holdout, y_pred_holdout)

        elif self.outer_loss_function == 'pr-auc':
            loss = -average_precision_score(self.y_true_holdout,
                                            y_pred_holdout)

        elif self.outer_loss_function == 'roc-auc':
            y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_pred_holdout]
            fpr, tpr, _ = roc_curve(self.y_true_holdout, y_pred_holdout_proba)
            loss = -auc(fpr, tpr)

        else:
            raise KeyError('Invalid outer loss function')

        self.logger.info('parameter suffix: %s' % self.param_suffix)
        self.logger.info('loss value: %.6f' % loss)

        return loss
Example #18
def calculate_accuracy(train_x, train_y, test_x, test_y):
    loss_list = ['benign', 'dos', 'probe', 'u2r', 'r2l']
    model.fit(train_x, train_y)
    model_predicted = model.predict(test_x)
    model_predicted = pd.DataFrame(model_predicted)
    model_predicted.columns = test_y.columns

    from sklearn.metrics import precision_score, mean_squared_error, f1_score, hinge_loss
    precision = precision_score(test_y, model_predicted, average='micro')  # avoid shadowing the imported name
    print('The micro-averaged precision is', precision)
    class_proba = model.predict_proba(test_x)
    class_proba_managed = pd.DataFrame()
    # temp=class_proba[0]
    # [row[1] for row in temp]
    class_proba_managed = class_proba[1]
    class_proba_managed = pd.DataFrame(class_proba_managed)
    class_proba_managed.columns = test_y.columns

    loss_function = []

    for columns in test_y.columns:
        loss_function.append(
            hinge_loss(test_y[columns], class_proba_managed[columns]))

    average_loss = sum(loss_function) / len(loss_function)  # average over all label columns
    print("The average loss is", average_loss)
Example #19
def h_loss(estimator, X_test, y_test):
    "hinge loss"

    y_predicted = estimator.predict(X_test)

    score = -hinge_loss(y_test, y_predicted)

    return score
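Because the signature is (estimator, X_test, y_test) and the loss is negated so that greater is better, this function can be passed directly as a scoring callable. A hedged usage sketch:

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import hinge_loss
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

X, y = load_breast_cancer(return_X_y=True)
scores = cross_val_score(LinearSVC(max_iter=10000), X, y, cv=3, scoring=h_loss)
print(scores)  # one negated hinge loss per fold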
Example #20
def print_metrics(y_true, y_preds):
    '''
    Description: print out accuracy, recall, precision, hinge loss, and f1-score of model
    '''
    print "Accuracy: %.4g" % metrics.accuracy_score(y_true, y_preds, normalize=True)
    print "Recall: %.4g" % metrics.recall_score(y_true, y_preds)
    print "Precision: %.4g" % metrics.precision_score(y_true, y_preds)
    print "Hinge loss: %.4g" % metrics.hinge_loss(y_true, y_preds)
    print "F1 score: %.4g" % metrics.f1_score(y_true, y_preds)
Example #21
def svm_run(filters,
            c_range=1.0,
            kernel_type='rbf',
            gamma='auto',
            train_sizes=[15, 100, 300, 500, 800],
            table_folder="/",
            save_file=None,
            time_from=32,
            time_to=8,
            downsample_ratio=None,
            oversample=None):
    timesteps = time_from - time_to

    X_train, X_test, y_train, y_test, churn_number, total_number, feature_names = import_and_preprocess_table(
        timesteps, time_from, time_to, filters, table_folder, downsample_ratio,
        oversample)

    X_train = list(map(lambda x: x.flatten(), X_train))
    X_test = list(map(lambda x: x.flatten(), X_test))

    clf = svm.SVC(kernel=kernel_type, gamma=gamma, C=c_range)

    # train_sizes, train_scores, validation_scores = learning_curve(clf, X_train, y_train, train_sizes=train_sizes, cv=5, shuffle=True, scoring='f1')
    train_sizes, train_scores, validation_scores = training_curve(
        clf,
        X_train,
        y_train,
        X_test,
        y_test,
        train_sizes=train_sizes,
        shuffle=True,
        scoring='precision',
        train_last=True)

    # print(train_scores, valid_scores)
    clf.fit(X_train, y_train)

    # cross_val_score(clf, X_train, y_train, scoring='recall_macro', cv=5)
    y_pred = clf.predict(X_test)
    if kernel_type == 'linear':
        feature_importances = clf.coef_.flatten()
    else:
        feature_importances = []
    scores = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        hinge_loss(y_test, y_pred),
        f1_score(y_test, y_pred)
    ]

    # print_feature_importances(feature_importances, feature_names, all=True)
    print(y_pred)
    return [
        y_pred, y_test, feature_importances, scores, train_sizes, train_scores,
        validation_scores, churn_number, total_number, feature_names
    ]
Example #22
def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))

    with warnings.catch_warnings():
        # Test deprecated pos_label
        assert_equal(
            hinge_loss(-y_true, pred_decision),
            hinge_loss(y_true, pred_decision, pos_label=-1, neg_label=1))

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision))
    with warnings.catch_warnings():
        # Test deprecated pos_label
        assert_equal(1.2 / 4, hinge_loss(y_true, pred_decision,
                                         pos_label=2, neg_label=0))
Example #23
    def class_cost(self, X, Yc):
        """Calculates the current cost of classification"""
        predictions = [
            x @ wc + rho for x, wc, rho in zip(X, self.Wc, self.rho)
        ]

        h_loss = sum([hinge_loss(y, pred) for y, pred in zip(Yc, predictions)])

        return h_loss
Example #24
    def train(self, kernel_type):
        print("--- Training {} SVM with Gamma = {} ---".format(
            kernel_type, self.bestGamma))
        start_time = datetime.now()

        # Build the SVM with linear/RBF kernel
        clf = self.classifier(kernel_=kernel_type,
                              gamma_=self.bestGamma,
                              verbose_=VERBOSE)
        clf.fit(self.trainX, self.trainY)

        time = datetime.now() - start_time
        print("Finish training in ", time)

        # Compute the loss of the SVM on the training set and test set
        pred_decision_train = clf.decision_function(self.trainX)
        loss_train = hinge_loss(self.trainY, pred_decision_train)

        pred_decision_test = clf.decision_function(self.testX)
        loss_test = hinge_loss(self.testY, pred_decision_test)
        print("=> Loss in training set: {:.4f}".format(loss_train))
        print("=> Loss in test set: {:.4f}".format(loss_test))

        # Compute the accuracy
        predY_train = clf.predict(self.trainX)
        acc_train = accuracy_score(self.trainY, predY_train)

        predY_test = clf.predict(self.testX)
        acc_test = accuracy_score(self.testY, predY_test)
        print("=> Accuracy in training set: {:.4f}".format(acc_train))
        print("=> Accuracy in test set: {:.4f}".format(acc_test))
        # save the well trained classifier

        # plot AUC
        fpr, tpr, _ = roc_curve(self.testY, pred_decision_test)
        AUC = auc(fpr, tpr)
        #plot_auc_curve(fpr, tpr, AUC)
        cm = confusion_matrix(self.testY, predY_test)
        plot_confusion_matrix(cm, ["Class 0", "Class 1"],
                              title=self.dataset_name)

        self.clf = clf
Example #25
def check_vb(datanm, samples_per_class, Cs, num_classes, gamma, num_iter = 100, kernel = 'linear', strat = 'ovr'):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.5, train_size=0.5, random_state=None)
    ans = np.zeros((len(Cs), len(gamma), samples_per_class/2, 4))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index , :], labels[test_index ]]

        for l in xrange(samples_per_class/2):
            ind_train = []
            ind_valid = []
            for k in xrange(num_classes):
                ind_train = ind_train + np.where(train_data[1] == k)[0].tolist()[:l+1]
                ind_valid = ind_valid + np.where(valid_data[1] == k)[0].tolist()[:l+1]

            ctrain_data = [ train_data[0][ind_train], train_data[1][ind_train] ]
            cvalid_data = [ valid_data[0][ind_valid], valid_data[1][ind_valid] ]

            for i, C in enumerate(Cs):
                for j, g in enumerate(gamma):
                    clf = svm.SVC(C=C, kernel=kernel, degree=3, gamma=g, coef0=0.0, shrinking=True,
                                  probability=False, tol=0.001,  cache_size=10000, class_weight=None,
                                  verbose=False, max_iter=-1, decision_function_shape=strat, random_state=None)
                    clf.fit(ctrain_data[0], ctrain_data[1])

                    #out_train = clf.predict_proba(ctrain_data[0])
                    #out_valid = clf.predict_proba(cvalid_data[0])

                    #ans[i, l, 0] += log_loss(ctrain_data[1], out_train)
                    #ans[i, l, 1] += log_loss(cvalid_data[1], out_valid)
                    
                    out_train = clf.decision_function(train_data[0])
                    out_valid = clf.decision_function(valid_data[0])

                    ans[i, j, l, 2] += hinge_loss(train_data[1], out_train, range(num_classes))
                    ans[i, j, l, 3] += hinge_loss(valid_data[1], out_valid, range(num_classes))

    ans /= num_iter

    np.savez("svm_bv_" + kernel + '_' + strat, ans= ans, Cs = Cs, num_iter = num_iter, num_classes = num_classes, samples_per_class = samples_per_class)
    return ans
Example #26
    def get_soft_linear_svm_w_b(self, subset_c):
        x = self.X[subset_c]
        y = self.Y[subset_c]
        reg_par = float(1) / (2.0 * self.lamb * subset_c.shape[0])
        model = svm.SVC(kernel='linear', C=reg_par)
        model.fit(x, y)
        y_pred = model.decision_function(x)
        w = model.coef_
        reg = self.lamb * (subset_c.shape[0]) * np.dot(w, w.T)[0][0]
        hinge_machine_loss = hinge_loss(y, y_pred)
        hinge_machine_loss *= y_pred.shape[0]
        return reg + hinge_machine_loss
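The C used above comes from rescaling sklearn's SVC objective (1/2)||w||^2 + C * sum(hinge): multiplying through by 2*lamb*n and choosing C = 1/(2*lamb*n) gives lamb*n*||w||^2 + sum(hinge), which is exactly the reg + hinge_machine_loss this method returns. A quick arithmetic check of that mapping:

lamb, n = 0.1, 50
C = 1.0 / (2.0 * lamb * n)                            # the reg_par used above
assert abs((2 * lamb * n) * C - 1.0) < 1e-12          # hinge term keeps coefficient 1
assert abs((2 * lamb * n) * 0.5 - lamb * n) < 1e-12   # (1/2)||w||^2 -> lamb*n*||w||^2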
Example #27
def optimizelinearsvc(sigma, knn, penalty, tolerance):  #using the package
    x1 = [[0] * D for k in range(knn)]
    lossfunction = [0] * knn
    totalloss = 0
    beta = [0] * knn
    beta0 = [0] * knn
    x_training = [[] for k in range(knn)]
    y_training = [[] for k in range(knn)]
    a_training = [0] * knn
    #    print(np.array(x_training).shape)
    #
    #    print(np.array(y_training).shape)
    for k in range(knn):
        for i in range(N):
            if sigma[i][k] >= 0.5:
                x_training[k].append(x[i])
                y_training[k].append(y[i])

#    index = [[0]*J for k in range(knn)]
#    indicator = [0]* knn
#    for k in range(knn):
#        for j in range(J):
#            for i in range(N):
#                if y_training[k][i]==j:
#                    index[k][j] = 1
#                    break

    for k in range(knn):

        lin_clf = LinearSVC(C=penalty, tol=tolerance)
        lin_clf.fit(x_training[k], y_training[k])
        beta[k] = lin_clf.coef_
        beta0[k] = lin_clf.intercept_
        a_training[k] = accuracy_score(y_training[k],
                                       lin_clf.predict(x_training[k]))
        b_training = lin_clf.decision_function(x_training[k])
        lossfunction[k] = hinge_loss(y_training[k], b_training)

    for k in range(knn):
        totalloss = totalloss + a_training[k] * sum(sigma[i][k]
                                                    for i in range(N))


#    print ('accuracy', accuracy_score(vay, lin_clf.predict(x_testing_extraction)))

    for k in range(knn):
        for d in range(D):
            x1[k][d] = sum(sigma[i][k] * x[i][d]
                           for i in range(N)) / sum(sigma[i][k]
                                                    for i in range(N))

    return a_training, beta, beta0, x1, lossfunction, totalloss / N
Example #28
def prediction(X_train, X_test, y_train, y_test):
    [classifier_names, classifiers] = build_classifiers()
    for cidx, clf_name in enumerate(classifier_names):
        clf = classifiers[cidx].fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if hasattr(clf, "decision_function"):
            pred_decision = clf.decision_function(X_test)
        else:
            pred_decision = clf.predict_proba(X_test)#[:, 1]
        performances = [cohen_kappa_score(y_test, y_pred), hinge_loss(y_test, pred_decision), matthews_corrcoef(y_test, y_pred)]
        print("%s\t cohen_kappa_score: %.2f\t hinge_loss: %.2f\tmatthews_corrcoef:%.2f " % (clf_name, performances[0], performances[1], performances[2]))
        cm = confusion_matrix(y_test, y_pred)
    # return after the loop, so every classifier is evaluated rather than only the first
    return ["%.2f" % item for item in performances], cm
Example #29
    def cross_validation(self, model):
        # kfold = cross_validation.KFold(self.train_x.shape[0], n_folds=5, shuffle=True, random_state=self.random_state)
        kfold = cross_validation.StratifiedKFold(
            self.train_y,
            n_folds=self.k_fold_,
            shuffle=True,
            random_state=self.random_state)
        scores = {
            'auc': list(),
            'hinge_loss': list(),
            'log_loss': list(),
            'accuracy': list(),
            'precision': list(),
            'recall': list(),
            'f1_value': list()
        }
        #scores = list()
        preds = np.zeros(len(self.train_y))
        i = 0
        for train_idx, test_idx in kfold:
            print(' --------- fold {0} ---------- '.format(i))
            train_x = self.train_x.iloc[
                train_idx]  # select rows explicitly by positional index, see sinhrks.hatenablog.com/entry/2014/11/12/233216
            train_y = self.train_y[train_idx]
            test_x = self.train_x.iloc[test_idx]
            test_y = self.train_y[test_idx]
            model.fit(train_x, train_y)
            pred = model.predict(test_x)
            preds[test_idx] = pred

            score = metrics.roc_auc_score(test_y, pred)  # auc
            scores['auc'].append(score)
            score = metrics.hinge_loss(test_y, pred)  # hinge_loss
            scores['hinge_loss'].append(score)
            score = metrics.log_loss(test_y, pred)  # log_loss
            scores['log_loss'].append(score)
            #score = metrics.accuracy_score(test_y, pred)# accuracy
            #scores['accuracy'].append(score)
            #score = metrics.precision_score(test_y, pred)# precision
            #scores['precision'].append(score)
            #score = metrics.recall_score(test_y, pred)# recall
            #scores['recall'].append(score)
            #score = metrics.f1_score(test_y, pred)# f_value
            #scores['f1_value'].append(score)
            i += 1
        for key in scores.keys():
            scores[key] = np.asarray(scores[key], dtype=np.float32)

        #print scores.mean(), scores.std()
        return scores, preds
Example #30
    def compare(self, model, X, y):
        """Compares the score of a sample in two models.
        Returns a crossvalidation of metrics, predictions and score.
        
        :param model: model
        :param X: data
        :type model: MultiModelClassifier
        :type X: ndarray or scipy.sparse matrix, (n_samples, n_features)
        """
        scores = {}
        y_pred = self.predict(X)
        y_pred_prob = self._predict_prob(X)

        other_y_pred = model.predict(X)
        other_y_pred_prob = model._predict_prob(X)

        self._guess_problem(y)

        if self._problem == 'binary':
            #Binary-only metrics

            scores['PreRec'] = (metrics.precision_recall_curve(y, y_pred_prob),
                                metrics.precision_recall_curve(
                                    y, other_y_pred_prob))

            scores['ROC'] = (metrics.roc_curve(y, y_pred_prob),
                             metrics.roc_curve(y, other_y_pred_prob))

        scores['Kappa'] = (metrics.cohen_kappa_score(y, y_pred),
                           metrics.cohen_kappa_score(y, other_y_pred))
        scores['Confusion'] = (metrics.confusion_matrix(y, y_pred),
                               metrics.confusion_matrix(y, other_y_pred))
        scores['HL'] = (metrics.hinge_loss(y, y_pred_prob),
                        metrics.hinge_loss(y, other_y_pred_prob))
        scores['MCC'] = (metrics.matthews_corrcoef(y, y_pred),
                         metrics.matthews_corrcoef(y, other_y_pred))

        return scores
Example #31
def linear_test(iris_X, iris_y):
    optimal_c, iris_X_train, iris_y_train, iris_X_valid, iris_y_valid, iris_X_test, iris_y_test = find_c(
        iris_X, iris_y)

    # Test loss and accuracy for optimal c
    svc = svm.SVC(kernel='linear', C=optimal_c)
    svc.fit(iris_X_train, iris_y_train)
    predictions = svc.predict(iris_X_test)
    test_score = accuracy_score(iris_y_test, predictions)
    prediction_dec = svc.decision_function(iris_X_test)
    h_loss_t = hinge_loss(iris_y_test, prediction_dec)
    print(" Linear>>>>")
    print("Testing Score and loss for Optimal C= {} is : {}, {} \n".format(
        optimal_c, test_score * 100.0, h_loss_t))
Example #32
def apply_model_linear(iris_X_train, iris_y_train, iris_X_valid, iris_y_valid,
                       c):
    svc = svm.SVC(kernel='linear', C=c)
    svc.fit(iris_X_train, iris_y_train)

    # Validation loss and accuracy
    predictions = svc.predict(iris_X_valid)
    valid_score = accuracy_score(iris_y_valid, predictions)
    prediction_dec = svc.decision_function(iris_X_valid)
    h_loss_v = hinge_loss(iris_y_valid, prediction_dec)
    #
    # print(
    #     "Validation Score and loss for C= {} is : {}, {}".format(c, valid_score * 100.0, h_loss_v))
    return h_loss_v
Example #33
def feature_update(x, v, j, Z, phi_prototypes, prototypes, labels, bag_index):
    lamb = 1                                          # NOTE: where does this lambda come from?
    Z_copy = deepcopy(Z)
    phi_prototypes_prim = deepcopy(phi_prototypes)    # needed for the return value below
    phi_prototypes_prim[bag_index] = j
    #prototypes_prim[bag_index] = deepcopy(train_bags[bag_index][j])
    for i_index_bag in range(0, len(train_bags)):     # for each bag
        Z_copy[i_index_bag, j] = np.exp(-lamb * _min_hau_bag(train_bags[i_index_bag], [x]))
        pred_decision = lin_svc.decision_function(Z_copy)
        v_prim = hinge_loss(labels, pred_decision)
        if v_prim > v:
            v_prim = np.inf
            break
    return v_prim, Z_copy, phi_prototypes_prim
Example #34
def calculate_loss(y_true, y_score, outer_loss_function):
    if outer_loss_function == 'logistic':
        y_pred_holdout_proba = [1. / (1 + exp(-i)) for i in y_score]
        loss = log_loss(y_true, y_pred_holdout_proba)
    elif outer_loss_function == 'squared':
        loss = mean_squared_error(y_true, y_score)
    elif outer_loss_function == 'hinge':
        loss = hinge_loss(y_true, y_score)
    elif outer_loss_function == 'pr-auc':
        loss = -average_precision_score(y_true, y_score)
    elif outer_loss_function == 'roc-auc':
        fpr, tpr, _ = roc_curve(y_true, y_score)
        loss = 1 - auc(fpr, tpr)
    else:
        raise KeyError('Invalid outer loss function')
    return loss
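A hedged usage sketch for this dispatcher, showing the imports it assumes; the hinge case is hand-checkable (margins [0.9, 0.4, 0.3, 1.2] give a mean loss of 0.35):

from math import exp
from sklearn.metrics import (auc, average_precision_score, hinge_loss,
                             log_loss, mean_squared_error, roc_curve)

print(calculate_loss([1, -1, 1, -1], [0.9, -0.4, 0.3, -1.2], 'hinge'))  # 0.35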
Example #35
    def get_soft_kernel_svm_w_b(self, subset_c):
        x = self.X[subset_c]
        y = self.Y[subset_c]
        reg_par = float(1) / (2.0 * self.lamb * subset_c.shape[0])
        model = svm.SVC(C=reg_par, kernel='poly', degree=2, gamma='auto')
        model.fit(x, y)
        coef = model.dual_coef_
        sv = model.support_vectors_
        w = np.dot(coef, sv)
        reg = self.lamb * (subset_c.shape[0]) * np.dot(w, w.T)[0][0]
        y_pred = model.decision_function(x)
        hinge_machine_loss = hinge_loss(y, y_pred)
        hinge_machine_loss *= y_pred.shape[0]

        return reg + hinge_machine_loss
Example #36
    def cross_validation(self, model):
        # kfold = cross_validation.KFold(self.train_x.shape[0], n_folds=5, shuffle=True, random_state=self.random_state)
        kfold = cross_validation.StratifiedKFold(self.train_y, 
                                                 n_folds=self.k_fold_, 
                                                 shuffle=True, 
                                                 random_state=self.random_state)
        scores = {'auc':list(),
                  'hinge_loss':list(),
                  'log_loss':list(),
                  'accuracy':list(),
                  'precision':list(),
                  'recall':list(),
                  'f1_value':list()}
        #scores = list()
        preds = np.zeros(len(self.train_y))
        i = 0
        for train_idx, test_idx in kfold:
            print (' --------- fold {0} ---------- '.format(i))
            train_x = self.train_x.toarray()[train_idx]
            train_y = self.train_y[train_idx]
            test_x = self.train_x.toarray()[test_idx]
            test_y = self.train_y[test_idx]
            model.fit(train_x, train_y)
            pred = model.predict(test_x)
            preds[test_idx] = pred

            score = metrics.roc_auc_score(test_y, pred)# auc
            scores['auc'].append(score)
            score = metrics.hinge_loss(test_y, pred)# hinge_loss
            scores['hinge_loss'].append(score)
            score = metrics.log_loss(test_y, pred)# log_loss
            scores['log_loss'].append(score)
            #score = metrics.accuracy_score(test_y, pred)# accuracy
            #scores['accuracy'].append(score)
            #score = metrics.precision_score(test_y, pred)# precision
            #scores['precision'].append(score)
            #score = metrics.recall_score(test_y, pred)# recall
            #scores['recall'].append(score)
            #score = metrics.f1_score(test_y, pred)# f_value
            #scores['f1_value'].append(score)
            i += 1

        for key in scores.keys():
            scores[key] = np.asarray(scores[key], dtype=np.float32)
            #print key, scores[key].mean(), scores[key].std()
        return scores, preds
Example #37
def learn_SVM(X_train,
              Y_train,
              X_test,
              Y_test,
              kernel='linear',
              C=1,
              gamma=None,
              print_result=False,
              print_all=True):
    """ train an SVM model from extracted characteristics of a signal
    INPUT : listes train/test
    OUTPUT : training score, confusion matrix
    """

    # initialization of the SVM model
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    if gamma is not None:
        SVM_model = svm.SVC(kernel=kernel,
                            C=C,
                            gamma=gamma,
                            class_weight='balanced')
    else:
        SVM_model = svm.SVC(kernel=kernel, C=C, class_weight='balanced')

    # training the model
    SVM_model.fit(X_train, Y_train)
    pred_decision = SVM_model.decision_function(X_test)
    loss = hinge_loss(y_true=Y_test, pred_decision=pred_decision)

    assert print_result in [True, False]
    # if True, display the score of the trained model on the test-set and the confusion matrix
    if print_result:
        score, confusion_matrix = print_SVM_results(SVM_model,
                                                    X_test,
                                                    Y_test,
                                                    kernel=kernel,
                                                    C=C,
                                                    gamma=gamma,
                                                    print_all=print_all)
    else:
        # testing the model
        score = SVM_model.score(X_test, Y_test)

        confusion_matrix = None

    return SVM_model, loss, score, confusion_matrix
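A hedged usage sketch on a toy split; print_SVM_results is assumed to be defined elsewhere in the module, so print_result stays False here:

from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import hinge_loss
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model, loss, score, cm = learn_SVM(X_tr, y_tr, X_te, y_te,
                                   kernel='rbf', C=1, gamma='scale')
print(loss, score)  # test hinge loss and accuracy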
Example #38
def train_svm_poly(X_trn, y_trn, l, P, g):
    splits = 5
    kf = KFold(n_splits=splits, shuffle=True)
    clf = svm.SVC(kernel='poly', C=1 / (2 * l), degree=P, coef0=g, gamma=1)
    sum_hinge_loss = 0
    for train_index, test_index in kf.split(X_trn):
        # Split train-test
        X_train, X_test = X_trn[train_index], X_trn[test_index]
        y_train, y_test = y_trn[train_index], y_trn[test_index]

        # Train the model
        clf.fit(X_train, y_train)
        predictions = clf.decision_function(X_test)

        sum_hinge_loss += hinge_loss(y_test, predictions)
    avg_hinge_loss = sum_hinge_loss / splits
    return avg_hinge_loss
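A hedged usage sketch on synthetic XOR-like data, reading l as the lambda penalty, P as the polynomial degree, and g as the coef0 offset:

import numpy as np
from sklearn import svm
from sklearn.metrics import hinge_loss
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X_trn = rng.randn(200, 2)
y_trn = np.sign(X_trn[:, 0] * X_trn[:, 1])   # labels in {-1, +1}
print(train_svm_poly(X_trn, y_trn, l=0.5, P=2, g=1.0))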
Example #39
def apply_model_rbf(iris_X_train, iris_y_train, iris_X_valid, iris_y_valid, c,
                    g, count, to_plot):
    svc = svm.SVC(kernel='rbf', gamma=g, C=c)
    svc.fit(iris_X_train, iris_y_train)

    # Validation loss and accuracy
    predictions = svc.predict(iris_X_valid)
    valid_score = accuracy_score(iris_y_valid, predictions)
    prediction_dec = svc.decision_function(iris_X_valid)
    h_loss_v = hinge_loss(iris_y_valid, prediction_dec)
    if to_plot:
        p = plot_helper.plot_helper(iris_X_train, iris_y_train, c, g, svc,
                                    count)
        p.plot()
    # print(
    #     "Validation Score and loss for C= {} and gamma={} is : {}, {}".format(c, g, valid_score * 100.0, h_loss_v))
    return h_loss_v
Example #40
    def get_soft_linear_svm_w(self, subset_c):
        x = self.X[subset_c]
        y = self.Y[subset_c]

        reg_par = float(1) / (2.0 * self.lamb * subset_c.shape[0])
        model = svm.LinearSVC(fit_intercept=False, C=reg_par, loss='hinge')
        model.fit(x, y)
        y_pred = model.decision_function(x)

        w = model.coef_
        b = model.intercept_
        assert (b == 0)
        reg = self.lamb * (subset_c.shape[0]) * np.dot(w, w.T)[0][0]

        hinge_machine_loss = hinge_loss(y, y_pred)
        hinge_machine_loss *= y_pred.shape[0]
        return reg + hinge_machine_loss
Example #41
    def cross_validation(self, model):

        kfold = cross_validation.StratifiedKFold(self.train_y, 
                                                 n_folds=self.k_fold_, 
                                                 shuffle=True, 
                                                 random_state=self.random_state)
        scores = {'auc':list(),
                  'hinge_loss':list(),
                  'log_loss':list(),
                  'accuracy':list(),
                  'precision':list(),
                  'recall':list(),
                  'f1_value':list()}
        #scores = list()
        preds = np.zeros(len(self.train_y))
        i = 0
        for train_idx, test_idx in kfold:
            print (' --------- fold {0} ---------- '.format(i))
            train_x = self.train_x.iloc[train_idx]  # select rows explicitly by positional index, see sinhrks.hatenablog.com/entry/2014/11/12/233216
            train_y = self.train_y[train_idx]
            test_x = self.train_x.iloc[test_idx]
            test_y = self.train_y[test_idx]
            model.fit(train_x, train_y)
            pred = model.predict(test_x)
            preds[test_idx] = pred

            score = metrics.roc_auc_score(test_y, pred)# auc
            scores['auc'].append(score)
            score = metrics.hinge_loss(test_y, pred)# hinge_loss
            scores['hinge_loss'].append(score)
            score = metrics.log_loss(test_y, pred)# log_loss
            scores['log_loss'].append(score)
            #score = metrics.accuracy_score(test_y, pred)# accuracy
            #scores['accuracy'].append(score)
            #score = metrics.precision_score(test_y, pred)# precision
            #scores['precision'].append(score)
            #score = metrics.recall_score(test_y, pred)# recall
            #scores['recall'].append(score)
            #score = metrics.f1_score(test_y, pred)# f_value
            #scores['f1_value'].append(score)
            i += 1
        for key in scores.keys():
            scores[key] = np.asarray(scores[key], dtype=np.float32)
        #print scores.mean(), scores.std()
        return scores, preds
Example #42
    def getResult(self, predict, data_set):
        y_true, y_predict = control.calculate_entire_ds(predict, data_set)
        result = metrics.classification_report(y_true, y_predict)
        result += "\nAccuracy classification: %f\n" % metrics.accuracy_score(y_true, y_predict)
        result += "F1 score: %f\n" % metrics.f1_score(y_true, y_predict)
        result += "Fbeta score: %f\n" % metrics.fbeta_score(y_true, y_predict, beta=0.5)
        result += "Hamming loss: %f\n" % metrics.hamming_loss(y_true, y_predict)
        result += "Hinge loss: %f\n" % metrics.hinge_loss(y_true, y_predict)
        result += "Jaccard similarity: %f\n" % metrics.jaccard_similarity_score(y_true, y_predict)
        result += "Precision: %f\n" % metrics.precision_score(y_true, y_predict)
        result += "Recall: %f\n" % metrics.recall_score(y_true, y_predict)

        if self.is_binary():
            result += "Average precision: %f\n" % metrics.average_precision_score(y_true, y_predict)
            result += "Matthews correlation coefficient: %f\n" % metrics.matthews_corrcoef(y_true, y_predict)
            result += "Area Under the Curve: %f" % metrics.roc_auc_score(y_true, y_predict)

        return result
Example #43
def run_question_17_svm(x_trn,y_trn,c = [2,20,200],gamma = [1,0.01,0.001],kernel='rbf'):
    
    for penalty in c:
        for g in gamma:
            hinge_losses = []

            kf = KFold(n_splits=5, shuffle=True, random_state=3815)
            for train_index, test_index in kf.split(x_trn):

                x_trn_5, x_tst_5 = x_trn[train_index], x_trn[test_index]
                y_trn_5, y_tst_5 = y_trn[train_index], y_trn[test_index]

                clf = svm.SVC(kernel = kernel, C = 1/penalty, gamma = g)
                clf.fit(x_trn_5,y_trn_5)
                y_pred = clf.decision_function(x_tst_5)
                hinge_losses.append(hinge_loss(y_tst_5,y_pred))
            mean_hinge_loss = (sum(hinge_losses)/5)
            print("The mean hinge loss with 5-Fold cross validation using hinge loss with lambda = " + str(penalty)  + " and gamma = " + str(g) + " is: " , mean_hinge_loss)
Example #44
def test_hinge_loss_multiclass_with_missing_labels():
    pred_decision = np.array([
        [0.36, -0.17, -0.58, -0.99],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17],
        [-0.55, -0.38, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17]
    ])
    y_true = np.array([0, 1, 2, 1, 2])
    labels = np.array([0, 1, 2, 3])
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][2] + pred_decision[4][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision, labels=labels),
                 dummy_hinge_loss)
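A hedged illustration of why labels matters here (behavior as in recent scikit-learn releases): without it, hinge_loss cannot map four score columns onto classes when y_true only contains three of them:

import numpy as np
from sklearn.metrics import hinge_loss

y_true = np.array([0, 1, 2, 1, 2])
pred = np.random.RandomState(0).randn(5, 4)   # scores for 4 classes
try:
    hinge_loss(y_true, pred)                  # class 3 never appears in y_true
except ValueError as err:
    print("labels required:", err)
print(hinge_loss(y_true, pred, labels=np.arange(4)))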
Example #45
def test_hinge_loss_multiclass():
    pred_decision = np.array([
        [0.36, -0.17, -0.58, -0.99],
        [-0.54, -0.37, -0.48, -0.58],
        [-1.45, -0.58, -0.38, -0.17],
        [-0.54, -0.38, -0.48, -0.58],
        [-2.36, -0.79, -0.27,  0.24],
        [-1.45, -0.58, -0.38, -0.17]
    ])
    y_true = np.array([0, 1, 2, 1, 3, 2])
    dummy_losses = np.array([
        1 - pred_decision[0][0] + pred_decision[0][1],
        1 - pred_decision[1][1] + pred_decision[1][2],
        1 - pred_decision[2][2] + pred_decision[2][3],
        1 - pred_decision[3][1] + pred_decision[3][2],
        1 - pred_decision[4][3] + pred_decision[4][2],
        1 - pred_decision[5][2] + pred_decision[5][3]
    ])
    dummy_losses[dummy_losses <= 0] = 0
    dummy_hinge_loss = np.mean(dummy_losses)
    assert_equal(hinge_loss(y_true, pred_decision),
                 dummy_hinge_loss)
Example #46
def evaluate(estimator, dev_X, dev_y):
    print('evaluating on development set', flush=True)
    guess_dev = estimator.predict(dev_X)
    score_roc_auc_dev = roc_auc_score(dev_y, guess_dev)
    print('{:.4f} -- roc auc'.format(score_roc_auc_dev))
    score_brier_loss_dev = brier_score_loss(dev_y, guess_dev)
    print('{:.4f} -- brier loss'.format(score_brier_loss_dev))
    score_log_loss_dev = log_loss(dev_y, estimator.predict_proba(dev_X))
    print('{:.4f} -- log loss'.format(score_log_loss_dev))
    guess_dev_negative_one = guess_dev.copy().astype('int8')
    guess_dev_negative_one[guess_dev_negative_one == 0] = -1
    '''
    decision_function not implemented
    # score_hinge_loss_dev = hinge_loss(dev_y, estimator.decision_function(dev_X))
    '''
    score_hinge_loss_dev = hinge_loss(dev_y, guess_dev_negative_one)
    print('{:.4f} -- hinge loss'.format(score_hinge_loss_dev))
    score_matthews_corrcoef_dev = matthews_corrcoef(dev_y, guess_dev_negative_one)
    print('{:.4f} -- matthews_corrcoef'.format(score_matthews_corrcoef_dev))
    print(flush=True)

    return score_roc_auc_dev, score_brier_loss_dev,\
        score_log_loss_dev, score_hinge_loss_dev, score_matthews_corrcoef_dev
Example #47
                   param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)

Clf.fit(TrainFvs,TrainLabels)
PredictedLabels = Clf.predict(TestFvs)
print '*'*100
print 'classification report'
print '-'*20
Accuracy = np.mean(PredictedLabels == TestLabels)
print "Test Set Accuracy = ", Accuracy

print(metrics.classification_report(TestLabels,
            PredictedLabels, target_names=['Neg', 'Pos']))

print "Accuracy classification score:", metrics.accuracy_score(TestLabels, PredictedLabels)
print "Hamming loss:", metrics.hamming_loss(TestLabels, PredictedLabels)
print "Average hinge loss:", metrics.hinge_loss(TestLabels, PredictedLabels)
print "Log loss:", metrics.log_loss(TestLabels, PredictedLabels)
print "F1 Score:", metrics.f1_score(TestLabels, PredictedLabels)
print "Zero-one classification loss:", metrics.zero_one_loss(TestLabels, PredictedLabels)
print '*'*100




print 'total vocab size: {} '.format(len(model.vocab.keys()))

# for k,v in model.vocab.iteritems():
#         print k
#         print v
#         raw_input()
Example #48
def main_func(datanm, samples_per_class, C, num_classes, gamma, num_iter = 100, kernel = 'linear', strat = 'ovr'):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.3, train_size=0.7, random_state=None)
    recall = np.zeros((num_classes+1, 2))
    precision = np.zeros((num_classes+1, 2))
    f1 = np.zeros((num_classes+1, 2))
    accuracy = np.zeros((2))
    logloss = np.zeros((2))
    hingeloss = np.zeros((2))

    
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index , :], labels[test_index ]]

        clf = svm.SVC(C=C, kernel=kernel, degree=3, gamma=gamma, coef0=0.0, shrinking=True,
                      probability=False, tol=0.001,  cache_size=10000, class_weight=None,
                      verbose=False, max_iter=-1, decision_function_shape=strat, random_state=None)
        clf.fit(train_data[0], train_data[1])

        #out_train = clf.predict_proba(train_data[0])
        #out_valid = clf.predict_proba(valid_data[0])

        #logloss[0] += log_loss(train_data[1], out_train)
        #logloss[1] += log_loss(valid_data[1], out_valid)

        out_train = clf.decision_function(train_data[0])
        out_valid = clf.decision_function(valid_data[0])

        hingeloss[0] += hinge_loss(train_data[1], out_train)
        hingeloss[1] += hinge_loss(valid_data[1], out_valid)

        out_train = clf.predict(train_data[0])
        out_valid = clf.predict(valid_data[0])

        accuracy[0] += accuracy_score(train_data[1], out_train)
        accuracy[1] += accuracy_score(valid_data[1], out_valid)

        precision[:-1, 0] += precision_score(train_data[1], out_train, average = None)
        precision[-1, 0] += precision_score(train_data[1], out_train, average = 'macro')
        precision[:-1, 1] += precision_score(valid_data[1], out_valid, average = None)
        precision[-1, 1] += precision_score(valid_data[1], out_valid, average = 'macro')

        recall[:-1, 0] += recall_score(train_data[1], out_train, average = None)
        recall[-1, 0] += recall_score(train_data[1], out_train, average = 'macro')
        recall[:-1, 1] += recall_score(valid_data[1], out_valid, average = None)
        recall[-1, 1] += recall_score(valid_data[1], out_valid, average = 'macro')

        f1[:-1, 0] += f1_score(train_data[1], out_train, average = None)
        f1[-1, 0] += f1_score(train_data[1], out_train, average = 'macro')
        f1[:-1, 1] += f1_score(valid_data[1], out_valid, average = None)
        f1[-1, 1] += f1_score(valid_data[1], out_valid, average = 'macro')

    f1 /= num_iter
    recall /= num_iter
    precision /= num_iter
    logloss /= num_iter
    accuracy /= num_iter
    hingeloss /= num_iter

    np.savez("svm_final_" + kernel + '_' + strat, accuracy = accuracy, recall = recall, f1 = f1,
                             precision = precision, logloss = logloss, C = C,
                             num_iter = num_iter, num_classes = num_classes,
                             samples_per_class = samples_per_class,
                             hingeloss = hingeloss)
    return [accuracy, recall, f1, precision, logloss, hingeloss]
Example #49
def train_svm(train_data, valid_data, test_data, model_dir, C=1.0, kernel='rbf',
              num_classes=10, tol=0.001, max_iterations=-1, verbose=False,
              random_state=12345678, **kwargs):
    """
    Train a Support Vector Machine model on the given data

    Args:
        train_data: Dict with training 'features' and 'labels'
                    (Type: dict[str, np.ndarray])

        valid_data: Dict with validation 'features' and 'labels', or None
                    (Type: dict[str, np.ndarray])

        test_data: Dict with test 'features', 'labels' and 'file_idxs', or None
                   (Type: dict[str, np.ndarray])

        model_dir: Directory where the fitted model is saved
                   (Type: str)

    Keyword Args:
        C: SVM regularization hyperparameter
           (Type: float)

        verbose:  If True, print verbose messages
                  (Type: bool)

    Returns:
        clf: Classifier object
             (Type: sklearn.svm.SVC)

        train_metrics: Metrics (including hinge loss) on the training set
                       (Type: dict)

        valid_metrics: Metrics on the validation set, empty if no valid_data
                       (Type: dict)

        test_metrics: Metrics on the test set, empty if no test_data
                      (Type: dict)
    """
    np.random.seed(random_state)
    random.seed(random_state)

    X_train = train_data['features']
    y_train = train_data['labels']

    model_output_path = os.path.join(model_dir, "model.pkl")

    # Create classifier
    clf = SVC(C=C, probability=True, kernel=kernel, max_iter=max_iterations,
              tol=tol, random_state=random_state, verbose=verbose)

    # Fit data and get output for train and valid batches
    LOGGER.debug('Fitting model to data...')
    clf.fit(X_train, y_train)

    LOGGER.info('Saving model...')
    joblib.dump(clf, model_output_path)

    y_train_pred = clf.predict(X_train)
    # Compute new metrics
    classes = np.arange(num_classes)
    train_loss = hinge_loss(y_train, clf.decision_function(X_train), labels=classes)
    train_metrics = compute_metrics(y_train, y_train_pred, num_classes=num_classes)
    train_metrics['loss'] = train_loss
    train_msg = 'Train - hinge loss: {}, acc: {}'
    LOGGER.info(train_msg.format(train_loss, train_metrics['accuracy']))

    if valid_data:
        X_valid = valid_data['features']
        y_valid = valid_data['labels']
        y_valid_pred = clf.predict(X_valid)
        valid_loss = hinge_loss(y_valid, clf.decision_function(X_valid), labels=classes)
        valid_metrics = compute_metrics(y_valid, y_valid_pred, num_classes=num_classes)
        valid_metrics['loss'] = valid_loss
        valid_msg = 'Valid - hinge loss: {}, acc: {}'
        LOGGER.info(valid_msg.format(valid_loss, valid_metrics['accuracy']))
    else:
        valid_metrics = {}

    # Evaluate model on test data
    if test_data:
        X_test = test_data['features']
        y_test_pred_frame = clf.predict_proba(X_test)
        y_test_pred = []
        for start_idx, end_idx in test_data['file_idxs']:
            class_pred = y_test_pred_frame[start_idx:end_idx].mean(axis=0).argmax()
            y_test_pred.append(class_pred)

        y_test_pred = np.array(y_test_pred)
        test_metrics = compute_metrics(test_data['labels'], y_test_pred, num_classes=num_classes)
    else:
        test_metrics = {}

    return clf, train_metrics, valid_metrics, test_metrics
Example #50
    def test_hinge_loss(self):
        result = self.df.metrics.hinge_loss()
        expected = metrics.hinge_loss(self.target, self.decision)
        self.assertEqual(result, expected)
Example #51
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression (C=0.1,max_iter=100,n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1,n_iter=1,n_jobs=8,class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1,n_jobs=8,class_weight='balanced')

    Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    Clf = LogisticRegression (C=0.1,max_iter=1000,n_jobs=8,class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000,n_jobs=8,class_weight='balanced'), cv=5,
                   param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=1000), cv=3,
    #                param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)

    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2

    print 'Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram)


    PosSamples = [l.split('\t')[0].strip() for l in open (File).xreadlines() if l.strip().endswith('1')]#[:100]
    NegSamples = [l.split('\t')[0].strip() for l in open (File).xreadlines() if l.strip().endswith('0')]#[:100]
    print 'loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples))
    X = PosSamples + NegSamples
    y = [1 for _ in xrange(len(PosSamples))] + [-1 for _ in xrange (len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=random.randint(0,100))
    print '# TrainLabels', len(y_train)
    print '# TestLabels', len(y_test)

    print 'performing CVectorizer'
    CVectorizer = CountVectorizer(lowercase = True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer = SGTokenizer,
                                  tokenizer = Tokenizer,
                                  ngram_range=(1,2),
                                  dtype=np.float64,
                                  decode_error = 'ignore',
                                  max_df=0.8)
    print 'performing TfidfTransformer and Normalizer'
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()
    print 'creating Train and Test FVs'
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print 'feat ext time', time() - T0

    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)

    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)

    print 'Train/test split'
    print TrainFVs.shape
    print TestFVs.shape
    # raw_input('hit any key...')

    print 'training classifier with train samples shape:', TrainFVs.shape
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit (TrainFVs, y_train) # re-train on current training set (daily)
    print 'batch fitted'
    print 'training time', time() - T0
    # memory_dump('after_train_mem.txt')

    print 'testing classifier with test samples shape:', TestFVs.shape
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print 'testing time', time() - T0
    # memory_dump('after_test_mem.txt')

    print '*'*100
    print 'classification report'
    print '-'*20
    Accuracy = np.mean(PredictedLabels == y_test)
    print "Test Set Accuracy = ", Accuracy

    print(metrics.classification_report(y_test,
                PredictedLabels, target_names=['Neg', 'Pos']))

    print "Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels)
    print "Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels)
    print "Average hinge loss:", metrics.hinge_loss(y_test, PredictedLabels)
    print "Log loss:", metrics.log_loss(y_test, PredictedLabels)
    print "F1 Score:", metrics.f1_score(y_test, PredictedLabels)
    print "Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels)
    print '*'*100

    Vocab = CVectorizer.get_feature_names()
    # print Vocab[:100]
    # raw_input()
    try:
        FeatureImportances = Clf.coef_[0]
    except:
        FeatureImportances = Clf.best_estimator_.coef_[0]

    print FeatureImportances.shape
    raw_input()
    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
                print Vocab[PosFIndex], '+-', Vocab[NegFIndex]


    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1],TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)

    AllFVsTimesW = TestFVs*FeatureImportancesSparseArray
    print AllFVsTimesW.shape

    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print 'Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind])
        # print TestFV
        # print TestFV.shape
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print CurTestFV.shape
        # raw_input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps= CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp
        # print NegTopFeatureIndices, NegFeatImps
        if y_test[Ind] == 1:
            print 'top positive feats:', colored(', '.join(['['+Vocab[PosFIndex]+']' for PosFIndex in PosTopFeatureIndices]), 'green')

        else:
            print 'top negative feats: ', colored(', '.join (['['+Vocab[NegFIndex]+']' for NegFIndex in NegTopFeatureIndices]), 'red')
        Ind += 1
        raw_input()