Example #1
def evaluate_resampling(X, y, X_test, y_test, clf=None):
    """For evaluating various resampling methods"""
    if clf is None:
        clf = RandomForestClassifier(n_estimators=400,
                                     random_state=5,
                                     n_jobs=-1)

    probas = cross_val_predict(
        clf,
        X,
        y,
        cv=StratifiedKFold(n_splits=3),
        n_jobs=-1,
        method="predict_proba",
        verbose=2,
    )
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]

    print("Cross validation on training data: ")
    print("Log loss: {}".format(log_loss(y, probas)))
    print("Accuracy: {}".format(balanced_accuracy_score(y, preds)))
    print("F1 score: {}".format(f1_score(y, preds, average="micro")))

    print("Validation on testing data: ")
    clf.fit(X, y)
    ytest = clf.predict(X_test)
    yprobas_test = clf.predict_proba(X_test)
    print("Log loss: {}".format(log_loss(y_test, yprobas_test)))
    print("Accuracy: {}".format(balanced_accuracy_score(y_test, ytest)))
    print("F1 score: {}".format(f1_score(y_test, ytest, average="micro")))
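# A minimal usage sketch for evaluate_resampling above (the synthetic data and
# the imbalanced-learn package are assumptions, not part of the original source):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X_all, y_all = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=5)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, stratify=y_all, random_state=5)
evaluate_resampling(X_tr, y_tr, X_te, y_te)                    # baseline
X_res, y_res = SMOTE(random_state=5).fit_resample(X_tr, y_tr)
evaluate_resampling(X_res, y_res, X_te, y_te)                  # after SMOTE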
Example #2
def all_metrics(num, model, train, test, target, target_test):
    """ Calculating metric scores for all models"""

    ytrain = model.predict(train)
    yprobas = model.predict_proba(train)

    ytest = model.predict(test)
    yprobas_test = model.predict_proba(test)

    logloss_train = log_loss(target, yprobas)
    logloss_test = log_loss(target_test, yprobas_test)

    print("Training Log Loss: ", logloss_train)
    print("Testing Log Loss: ", logloss_test)

    acc_train = round(balanced_accuracy_score(target, ytrain) * 100, 3)
    acc_test = round(balanced_accuracy_score(target_test, ytest) * 100, 3)

    print("Training Accuracy: ", acc_train)
    print("Testing Accuracy: ", acc_test)

    f1score_train = f1_score(target, ytrain, average="micro")
    f1score_test = f1_score(target_test, ytest, average="micro")

    print("Training f1 Score: ", f1score_train)
    print("Testing f1 Score: ", f1score_test)
Example #3
def predict_using_random_model(x_test, y_test, x_cv, y_cv):
    # We need to generate 9 numbers that sum to 1.
    # One solution is to generate 9 random numbers and divide each by their sum.
    # ref: https://stackoverflow.com/a/18662466/4084039
    test_data_len = x_test.shape[0]
    cv_data_len = x_cv.shape[0]

    # create an output array with exactly the same size as the CV data
    cv_predicted_y = np.zeros((cv_data_len, 9))
    for i in range(cv_data_len):
        rand_probs = np.random.rand(1, 9)
        cv_predicted_y[i] = (rand_probs / sum(sum(rand_probs)))[0]
    print(
        "Log loss on Cross Validation Data using Random Model",
        log_loss(y_cv, cv_predicted_y, eps=1e-15),
    )

    # Test-set error:
    # create an output array with exactly the same size as the test data
    test_predicted_y = np.zeros((test_data_len, 9))
    for i in range(test_data_len):
        rand_probs = np.random.rand(1, 9)
        test_predicted_y[i] = (rand_probs / sum(sum(rand_probs)))[0]
    print(
        "Log loss on Test Data using Random Model",
        log_loss(y_test, test_predicted_y, eps=1e-15),
    )

    predicted_y = np.argmax(test_predicted_y, axis=1)

    plot_confusion_matrix(y_test, predicted_y + 1)

    return predicted_y, y_test
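# Side note (not from the original source): the per-row loops above can be
# replaced by one vectorized draw; np.random.dirichlet with a flat alpha gives
# rows that are non-negative and already sum to 1.
import numpy as np

random_probas = np.random.dirichlet(np.ones(9), size=100)  # shape (100, 9)
assert np.allclose(random_probas.sum(axis=1), 1.0)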
Example #4
def text_only_model(result, y):
    # keep only words that occur in more than 3 documents (min_df=4)
    text_vectorizer = CountVectorizer(min_df=4)
    x = text_vectorizer.fit_transform(result['Text'])
    Dataset = normalize(x, axis=0)
    # train_test_split comes from sklearn.model_selection (the old
    # sklearn.cross_validation module was removed)
    X_tr, X_cv, y_tr, y_cv = train_test_split(Dataset, y, test_size=0.3)
    X_tr, X_test, y_tr, y_test = train_test_split(X_tr, y_tr, test_size=0.3)
    tunes_para = [10**x for x in range(-5, 1)]
    cv_array_loss = []
    # tune the SGD alpha over this grid
    for i in tunes_para:
        print("for alpha =", i)
        clf = SGDClassifier(class_weight='balanced',
                            alpha=i,
                            penalty='l2',
                            loss='log',
                            random_state=42)
        clf.fit(X_tr, y_tr)
        clf2 = CalibratedClassifierCV(clf, method="sigmoid")
        clf2.fit(X_tr, y_tr)
        clf2_probs = clf2.predict_proba(X_test)
        cv_array_loss.append(
            log_loss(y_test, clf2_probs, labels=clf.classes_, eps=1e-15))
        # to avoid rounding error while multiplying probabilities, we use
        # log-probability estimates
        print("Log Loss :", log_loss(y_test, clf2_probs))
Example #5
def train_boost(booster,
                seed,
                oversampling=-1.0,
                use_tfidf=False,
                enable_cv=False,
                use_alldata=False,
                num_trees=-1):
    train, y, features = prepare_train()
    if use_tfidf:
        print('Using raw tf-idf sparse matrix ... ')
        features = 'auto'
        train_sparse = sparse.csr_matrix(train.values)
        # tfidf_sparse = load_sparse_csr('tfidf_stem_train.npz')
        bm25_sparse = load_sparse_csr('bm25_train.npz')
        # bm25_sparse = bm25_sparse[404290 - 50000:, :]
        # train = sparse.hstack([train_sparse, tfidf_sparse])
        # common_words = load_sparse_csr('train_tfidf_commonwords.npz')
        # symmdif = load_sparse_csr('train_tfidf_symmdiff.npz')
        train = sparse.hstack([train_sparse, bm25_sparse])
        del train_sparse, bm25_sparse
        print('Train shape: ', train.shape)

    if enable_cv:
        train, y = shuffle(train, y)
        booster.cv(train, y)
        exit()

    if use_alldata:
        print('Using all data to fit classifier ... ')
        assert num_trees > 0
        results = booster.fit_all(train, y, num_trees, features)
    else:
        print('Using train/dev split to fit classifier ... ')
        X_train, X_eval, y_train, y_eval = train_test_split(train,
                                                            y,
                                                            stratify=y,
                                                            test_size=0.20,
                                                            random_state=seed)

        if oversampling > 0:
            print('Oversampling X_train, X_eval datasets ... ')
            X_train, y_train = oversample_sparse(X_train,
                                                 y_train,
                                                 p=oversampling)
            X_eval, y_eval = oversample_sparse(X_eval, y_eval, p=oversampling)

        results = booster.fit(X_train, X_eval, y_train, y_eval, features)
        y_pred = booster.predict(X_eval)
        print(log_loss(y_eval, y_pred))
        print(y_pred)

    # free references to the training data before returning
    del train, y

    return results
Example #6
def logreg(train, test, cv):
    # NOTE: y_tr and y_cv are assumed to be globals defined by the data-prep step
    tunes_para = [10**x for x in range(-5, 1)]
    cv_array_loss = []
    # tune the SGD alpha over this grid
    for i in tunes_para:
        print("for alpha =", i)
        clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2',
                            loss='log', random_state=42)
        clf.fit(train, y_tr)
        clf2 = CalibratedClassifierCV(clf, method="sigmoid")
        clf2.fit(train, y_tr)
        clf2_probs = clf2.predict_proba(cv)
        cv_array_loss.append(
            log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15))
        # to avoid rounding error while multiplying probabilities, we use
        # log-probability estimates
        print("Log Loss :", log_loss(y_cv, clf2_probs))
Example #7
def get_sgd_lr_model_cross_grid_search():
    # logistic regression (SGD with log loss) with hyperparameter tuning
    # NOTE: xtrain_tfidf, ytrain, xval_tfidf and yval are assumed globals
    alpha = [10 ** x for x in range(-5, 6)]  # hyperparam for SGD classifier.
    f1_score_error_array = []
    for i in alpha:
        clf = SGDClassifier(alpha=i, loss='log', penalty='l2', random_state=42)
        lr_clf = OneVsRestClassifier(clf)
        lr_clf.fit(xtrain_tfidf, ytrain)
        y_pred_new = lr_clf.predict(xval_tfidf)
        f1_score_error_array.append(f1_score(yval, y_pred_new, average="micro"))
        print('For values of alpha = ', i, "The f1_score is:",
              f1_score(yval, y_pred_new, average="micro"))
        # NOTE: log_loss is fed hard label predictions here;
        # lr_clf.predict_proba would give a better-behaved estimate
        print('For values of alpha = ', i, "The log-loss is:",
              log_loss(yval, y_pred_new, eps=1e-15))

    fig, ax = plt.subplots()
    ax.plot(alpha, f1_score_error_array, c='g')
    for i, txt in enumerate(np.round(f1_score_error_array, 3)):
        ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], f1_score_error_array[i]))
    plt.grid()
    plt.title("Cross Validation Error for each alpha")
    plt.xlabel("Alpha i's")
    plt.ylabel("Error measure")
    plt.show()

    best_alpha = int(np.argmax(f1_score_error_array))
    clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
    lr_clf = OneVsRestClassifier(clf)
    lr_clf.fit(xtrain_tfidf, ytrain)
    y_pred_new = lr_clf.predict(xval_tfidf)
    final_f1_score = f1_score(yval, y_pred_new, average="micro")  # list.append returns None, so assign the score directly
    print('f1_score from SGDClassifier model after CV grid search is : {}'.format(final_f1_score))
    return lr_clf, final_f1_score
Example #8
def get_scores(y_true,y_pred):
    brier_score = brier_score_loss(y_true,y_pred)
    log_score = log_loss(y_true,y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    pr_score = average_precision_score(y_true,y_pred)
    r2score = r2_score(y_true,y_pred)
    return math.sqrt(brier_score),log_score,roc_score,pr_score,r2score
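# A small usage sketch for get_scores above (synthetic values, not from the
# original source); y_pred holds predicted probabilities of the positive class:
import numpy as np
y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0.1, 0.4, 0.8, 0.9, 0.6, 0.2])
rmse_brier, log_score, roc_score, pr_score, r2score = get_scores(y_true, y_pred)
print(rmse_brier, log_score, roc_score, pr_score, r2score)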
Example #9
def get_scores(shots):
    y_true = [shot.result for shot in shots]
    y_pred = [shot.pred for shot in shots]
    brier_score = brier_score_loss(y_true,y_pred)
    log_score = log_loss(y_true,y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    pr_score = average_precision_score(y_true,y_pred)
    r2score = r2_score(y_true,y_pred)
    return math.sqrt(brier_score),log_score,roc_score,pr_score,r2score
Example #10
def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    pred_y = sig_clf.predict(test_x)

    print("log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
    print("Fraction of mis-classified points :",
          np.count_nonzero((pred_y - test_y)) / test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)
Example #11
def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    pred_y = sig_clf.predict(test_x)

    # for calculating log_loss we provide the array of probabilities belonging
    # to each class
    print("Log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
    # fraction of data points that are misclassified
    print("Fraction of mis-classified points :",
          np.count_nonzero((pred_y - test_y)) / test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)
Example #12
def variation_only_model(X_tr, X_cv, y_tr, y_cv, X_test, y_test):
    # one-hot encode the Variation feature (for train, test, cv)
    gene_vectorizer = CountVectorizer()
    train_Variation_feature_onehotCoding = gene_vectorizer.fit_transform(
        X_tr['Variation'])
    test_Variation_feature_onehotCoding = gene_vectorizer.transform(
        X_test['Variation'])
    cv_Variation_feature_onehotCoding = gene_vectorizer.transform(
        X_cv['Variation'])
    tunes_para = [10**x for x in range(-5, 1)]
    cv_array_loss = []
    # tune the SGD alpha over this grid
    for i in tunes_para:
        print("for alpha =", i)
        clf = SGDClassifier(class_weight='balanced',
                            alpha=i,
                            penalty='l2',
                            loss='log',
                            random_state=42)
        clf.fit(train_Variation_feature_onehotCoding, y_tr)
        clf2 = CalibratedClassifierCV(clf, method="sigmoid")
        clf2.fit(train_Variation_feature_onehotCoding, y_tr)
        clf2_probs = clf2.predict_proba(cv_Variation_feature_onehotCoding)
        cv_array_loss.append(
            log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15))
        # to avoid rounding error while multiplying probabilities, we use
        # log-probability estimates
        print("Log Loss :", log_loss(y_cv, clf2_probs))
    # refit with the chosen alpha (0.001) and report the final CV loss
    clf = SGDClassifier(class_weight='balanced',
                        alpha=0.001,
                        penalty='l2',
                        loss='log',
                        random_state=42)
    clf.fit(train_Variation_feature_onehotCoding, y_tr)
    clf2 = CalibratedClassifierCV(clf, method="sigmoid")
    clf2.fit(train_Variation_feature_onehotCoding, y_tr)
    clf2_probs = clf2.predict_proba(cv_Variation_feature_onehotCoding)
    cv_array_loss.append(
        log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilities, we use
    # log-probability estimates
    print("Log Loss :", log_loss(y_cv, clf2_probs))
Example #13
def evaluate_features(X,
                      y,
                      X_test,
                      y_test,
                      clf=None,
                      kfold=StratifiedKFold(n_splits=3)):
    """Can be used to evaluate features on training data and testing data;
    also compare model performance when specifying clf"""
    if clf is None:
        clf = RandomForestClassifier(n_estimators=400,
                                     random_state=5,
                                     n_jobs=-1)

    probas = cross_val_predict(
        clf,
        X,
        y,
        cv=kfold,
        n_jobs=-1,
        method="predict_proba",
        verbose=2,
    )
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]

    print("Cross validation on training data: ")
    print("Log loss: {}".format(log_loss(y, probas)))
    print("Accuracy: {}".format(balanced_accuracy_score(y, preds)))
    print("F1 score: {}".format(f1_score(y, preds, average="micro")))

    print("Validation on testing data: ")
    clf.fit(X, y)
    ytest = clf.predict(X_test)
    yprobas_test = clf.predict_proba(X_test)
    print("Log loss: {}".format(log_loss(y_test, yprobas_test)))
    print("Accuracy: {}".format(balanced_accuracy_score(y_test, ytest)))
    print("F1 score: {}".format(f1_score(y_test, ytest, average="micro")))
Example #14
def mea_metrics_calc(num, model, train, test, target, target_test):
    """ For the calculation and storage of accuracy and log loss """

    global mea_all

    ytrain = model.predict(train)
    yprobas = model.predict_proba(train)
    ytest = model.predict(test)
    yprobas_test = model.predict_proba(test)
    print("target = ", target[:5])
    print("ytrain = ", ytrain[:5])
    print("target_test =", target_test[:5])
    print("ytest =", ytest[:5])

    num_mea = 0
    for x in metrics_now:
        if x == 1:
            # log loss
            mea_train = log_loss(target, yprobas)
            mea_test = log_loss(target_test, yprobas_test)
        elif x == 2:
            # accuracy
            mea_train = round(balanced_accuracy_score(target, ytrain) * 100, 3)
            mea_test = round(
                balanced_accuracy_score(target_test, ytest) * 100, 3)
        elif x == 3:
            # f1 score
            mea_train = f1_score(target, ytrain, average="micro")
            mea_test = f1_score(target_test, ytest, average="micro")

        print("Measure of", metrics_all[x], "for train =", mea_train)
        print("Measure of", metrics_all[x], "for test =", mea_test)

        mea_all[num_mea].append(mea_train)  # train
        mea_all[num_mea + 1].append(mea_test)  # test
        num_mea += 2

    return plot_confusion_matrix(model, target_test, ytest)
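# Hypothetical setup for the globals assumed by mea_metrics_calc above; the
# original source defines these elsewhere, so the exact values are assumptions:
metrics_all = {1: 'log loss', 2: 'balanced accuracy', 3: 'f1 (micro)'}
metrics_now = [1, 2, 3]                               # which metrics to compute
mea_all = [[] for _ in range(2 * len(metrics_now))]   # one train and one test list per metric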
Example #15
    def find_exclude(self, n_splits=5):
        if not self.model_dict or not self.data_dict:
            print('Stopped: no models or data')
            return None

        for c in self.countries:
            self.model_dict[c].load_data(data=self.data_dict[c],
                                         balance=self.balances[c])
            exclude_list = []
            finish = False
            logloss_dict = {}
            while not finish:
                self.model_dict[c].set_exclude_list(exclude_list)
                self.model_dict[c].train()
                exclude_list_prev = exclude_list.copy()
                columns = [
                    x for x in self.model_dict[c].get_train().columns
                    if x not in exclude_list_prev
                ]
                exclude_list = [
                    x for (x, y) in zip(
                        columns, self.model_dict[c].get_feature_importances())
                    if y == 0
                ]
                if not exclude_list:
                    finish = True
                exclude_list = exclude_list_prev + exclude_list

                logloss_iter = []
                splits = self.model_dict[c].data.get_train_valid(
                    n_splits=n_splits, balance=self.balances[c])

                for i in range(0, n_splits):
                    self.model_dict[c].set_random_seed(i)
                    train, valid = splits[i]
                    self.model_dict[c].set_exclude_list(exclude_list)
                    self.model_dict[c].train(train[0], train[1])
                    pred = self.model_dict[c].predict(valid[0])
                    logloss_iter.append(
                        log_loss(valid[1].astype(int), pred['poor']))
                logloss = np.mean(logloss_iter)
                logloss_dict[logloss] = exclude_list
                print('logloss: {0} exclude length: {1}'.format(
                    logloss, len(exclude_list)))
            self.exclude_dict[c] = logloss_dict[np.min(
                list(logloss_dict.keys()))]
            print('Country: {0} exclude length: {1}'.format(
                c, len(self.exclude_dict.get(c))))

        return logloss_dict
Example #16
def Baseline_model(X_tr, X_cv, y_tr, y_cv, X_test, y_test):
    # We need a list of size 9 that sums to 1
    test_data_len = X_test.shape[0]
    cv_data_len = X_cv.shape[0]

    # create an output array with exactly the same size as the CV data
    cv_predicted = np.zeros((cv_data_len, 9))
    for i in range(cv_data_len):
        rand_probs = np.random.rand(1, 9)
        cv_predicted[i] = (rand_probs / sum(sum(rand_probs)))[0]  # normalize so the row sums to 1
    print("Log Loss at cross-validation step:",
          log_loss(y_cv, cv_predicted, eps=1e-15))
    # create an output array with exactly the same size as the test data
    test_predicted = np.zeros((test_data_len, 9))
    for i in range(test_data_len):
        rand_probs = np.random.rand(1, 9)
        test_predicted[i] = (rand_probs / sum(sum(rand_probs)))[0]  # normalize so the row sums to 1
    print("Log Loss at test step:",
          log_loss(y_test, test_predicted, eps=1e-15))
Example #17
def evaluate(config, model, valid_loader, test=False):
    model.eval()
    loss_fn = nn.CrossEntropyLoss(
        weight=torch.tensor(config.class_wts, dtype=torch.float).to(config.device)
    )
    total_ts_labels = np.array([], dtype=int)
    total_ts_preds = np.empty(shape=(0, 9), dtype=float)  # softmax outputs are floats

    avg_ts_loss = tnt.meter.AverageValueMeter()
    with torch.no_grad():
        for i, batch in enumerate(valid_loader):
            batch = [r.to(config.device) for r in batch]
            x_batch, y_batch = batch
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            avg_ts_loss.add(loss.item())
            y_pred = F.softmax(y_pred, dim=1).detach().cpu().numpy()  # (batch_size, 9)
            total_ts_labels = np.append(
                total_ts_labels, y_batch.cpu().numpy()
            )  # (batch_size, 1)
            total_ts_preds = np.append(
                total_ts_preds, y_pred, axis=0
            )  # (batch_size, 9)

    encoded_ts_labels = pd.get_dummies(total_ts_labels)  # (N, 9)

    # Accuracy
    val_acc = balanced_accuracy_score(
        total_ts_labels, total_ts_preds.argmax(axis=1)
    )  # argmax -> numeric labels (batch_size, 1)
    # log loss
    val_log_loss = log_loss(encoded_ts_labels, total_ts_preds)
    # f1 score
    val_f1score = f1_score(
        total_ts_labels, total_ts_preds.argmax(axis=1), average="micro"
    )

    if test:
        return (
            total_ts_labels,
            total_ts_preds,
            encoded_ts_labels,
            avg_ts_loss.value()[0],
            val_acc,
            val_log_loss,
            val_f1score,
        )
    else:
        return avg_ts_loss.value()[0], val_acc, val_log_loss, val_f1score
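# Side note (an observation, not from the original source): log_loss also
# accepts the integer labels directly, so the pd.get_dummies step above is
# optional as long as every one of the 9 classes appears in total_ts_labels:
# val_log_loss = log_loss(total_ts_labels, total_ts_preds)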
Example #18
def do_model(model, X_train, y_train, X_test, y_test, class_weight=None):
    if class_weight == 'balanced':
        sample_weight = unbalanced_sample_weight(y_train)
    else:
        sample_weight = None
    model.fit(X_train, y_train, sample_weight=sample_weight)

    predict_proba = model.predict_proba(X_test)
    proba = [x[1] for x in predict_proba]
    if class_weight == 'balanced':
        sample_weight = unbalanced_sample_weight(y_test)
    else:
        sample_weight = None
    loss = log_loss(y_test, proba, sample_weight=sample_weight)
    logger.debug('loss is %f', loss)

    return model, loss
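# unbalanced_sample_weight is not shown in the original source; a plausible
# minimal implementation (an assumption, not the author's code) using
# scikit-learn's helper would be:
from sklearn.utils.class_weight import compute_sample_weight

def unbalanced_sample_weight(y):
    # weight each sample inversely proportional to its class frequency
    return compute_sample_weight(class_weight='balanced', y=y)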
Example #19
def print_metrics(y_true, y_pred):
    print('auc:', roc_auc_score(y_true, y_pred))
    # the private sklearn.metrics.classification module was removed; use the
    # public sklearn.metrics functions instead
    print('accuracy:', accuracy_score(y_true, y_pred))

    cm = confusion_matrix(y_true, y_pred)  # avoid shadowing the function name
    # print('confusion matrix:')
    # print('report:', classification_report(y_true, y_pred))
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    print('sensitivity: {}'.format(sensitivity))
    specificity = tn / (tn + fp)
    print('specificity: {}'.format(specificity))
    print('precision: {}'.format(tp / (tp + fp)))
    total_acc = (tp + tn) / (tp + tn + fp + fn)
    random_acc = (((tn + fp) * (tn + fn) + (fn + tp) * (fp + tp)) /
                  (tp + tn + fp + fn)**2)
    kappa = (total_acc - random_acc) / (1 - random_acc)
    print('Cohen\'s kappa: {}'.format(kappa))
    youdens = sensitivity - (1 - specificity)
    print('Youden\'s index: {}'.format(youdens))
    print('log loss:', log_loss(y_true, y_pred))
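# A quick equivalence check (not from the original source): the manually
# derived kappa above should match scikit-learn's built-in implementation.
from sklearn.metrics import cohen_kappa_score

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]
print("Cohen's kappa (sklearn):", cohen_kappa_score(y_true, y_pred))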
Example #20

from scipy import sparse

train_q = sparse.hstack([train_vec_1, df3])
train_y = df['is_duplicate']

#========================================

from tqdm.auto import tqdm

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, log_loss  # sklearn.metrics.classification was removed
from sklearn.linear_model import SGDClassifier
#best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(alpha=1, penalty='l2', loss='log', random_state=42)
clf.fit(train_q, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_q, train_y)

predict_y = sig_clf.predict_proba(train_q)
print('For values of best alpha = ', 1, "The train log loss is:",
      log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predicted_y = np.argmax(predict_y, axis=1)
print(accuracy_score(train_y, predicted_y))
#=================================================

import joblib
joblib.dump(tf_idf_vect, 'tf_idf_vect.pkl')
joblib.dump(sig_clf, 'sig_clf.pkl')
Example #21
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))


# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
         label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
Example #22
    for clf_id, clf in enumerate(base_classifiers):

        print("Training base classifier #{0} -- {1}".format(
            clf_id, clf.__class__.__name__))

        dataset_blend_test_j = np.zeros((XTest.shape[0], N_FOLDS))
        for fold_id, (train_indexes, predict_indexes) in enumerate(splits):
            print("Fold", fold_id)

            # Fit on train part
            clf.fit(XTrain[train_indexes], YTrain[train_indexes])

            # Predict on the rest of data
            y_pred = clf.predict(XTrain[predict_indexes])
            df_blend_train[predict_indexes, clf_id] = y_pred
            lloss = log_loss(YTrain[predict_indexes], y_pred)
            oof_loglosses[clf_id, fold_id] = lloss
            print('LogLoss: ', lloss)

            # Predict on entire test set
            dataset_blend_test_j[:, fold_id] = clf.predict(XTest)

        # Average predictions for test set
        df_blend_test[:, clf_id] = dataset_blend_test_j.mean(1)

    print("Out of fold logloss-es:\n", oof_loglosses)

    # NOTE: np.save writes binary .npy files regardless of the .csv extension
    np.save('lgbstacking_train_82features.csv', df_blend_train)
    np.save('lgbstacking_test_82features.csv', df_blend_test)

    # print("\nBlending ...")
Example #23
def report_log_loss(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    sig_clf_probs = sig_clf.predict_proba(test_x)
    return log_loss(test_y, sig_clf_probs, eps=1e-15)
Example #24
# class_weight=None, warm_start=False, average=False, n_iter=None)

# some of the methods:
# fit(X, y[, coef_init, intercept_init, …])	Fit linear model with Stochastic Gradient Descent.
# predict(X)	Predict class labels for samples in X.

###############################################################################
log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42)
    clf.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_test)
    log_error_array.append(
        log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:",
          log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

###############################################################################
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for i, txt in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], log_error_array[i]))

plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
Example #25
print("(number of data points * number of features) in cross validation data = ",
      cv_x_responseCoding.shape)

# ## Naive Bayes

#  http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(
        log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilities, we use
    # log-probability estimates
    print("Log Loss :", log_loss(cv_y, sig_clf_probs))

fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], str(txt)),
                (np.log10(alpha[i]), cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
Example #26
alpha = [0.000001, 0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha = ", i)
    clf = SGDClassifier(class_weight='balanced',
                        alpha=i,
                        penalty='l2',
                        loss='hinge',
                        random_state=42)
    clf.fit(train_df, y_train)
    sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
    sig_clf.fit(train_df, y_train)
    sig_clf_probs = sig_clf.predict_proba(cv_df)
    cv_log_error_array.append(
        log_loss(y_cv, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    print("Log Loss : ", log_loss(y_cv, sig_clf_probs))

best_alpha = np.argmin(cv_log_error_array)
print("The best alpha : ", alpha[best_alpha])
clf = SGDClassifier(class_weight='balanced',
                    alpha=alpha[best_alpha],
                    penalty='l2',
                    loss='hinge',
                    random_state=42)
clf.fit(train_df, y_train)
sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
sig_clf.fit(train_df, y_train)
sig_clf_probs = sig_clf.predict_proba(train_df)
print("For best alpha, the training log Loss : ",
      log_loss(y_train, sig_clf_probs))
Example #27
print("One hot encoding features :")
print("(number of data points * number of features) in train data = ", train_x_onehotCoding.shape)
print("(number of data points * number of features) in test data = ", test_x_onehotCoding.shape)
print("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding.shape)

alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(
        log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    print("Log Loss :", log_loss(cv_y, sig_clf_probs))
    
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], str(txt)), (np.log10(alpha[i]), cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

best_alpha = np.argmin(cv_log_error_array)
clf = MultinomialNB(alpha=alpha[best_alpha])
Example #28
def logloss(act, pred, class_weight=None):
    if class_weight == 'balanced':
        sample_weight = unbalanced_sample_weight(act)
    else:
        sample_weight = None
    return log_loss(act, pred, sample_weight=sample_weight)
Example #29
plt.ylabel('Number of data points')
plt.title('Distribution')
plt.grid()
plt.show()
# check the class distribution in all sets (train, test, cv), plot it, and
# find the percentages
test_data_len = test_df.shape[0]
cv_data_len = cv_df.shape[0]
# create a random model for applying log loss

cv_predicted_y = np.zeros((cv_data_len, 9))
for i in range(cv_data_len):
    rand_probs = np.random.rand(1, 9)
    cv_predicted_y[i] = ((rand_probs / sum(sum(rand_probs)))[0])
print("log loss on cv data using random model",
      log_loss(y_cv, cv_predicted_y, eps=1e-15))

test_predicted_y = np.zeros((test_data_len, 9))
for i in range(test_data_len):
    rand_probs = np.random.rand(1, 9)
    test_predicted_y[i] = ((rand_probs / sum(sum(rand_probs)))[0])
print("log loss on test data using random model",
      log_loss(y_test, test_predicted_y, eps=1e-15))

predicted_y = np.argmax(test_predicted_y, axis=1)
print(predicted_y)

predicted_y = predicted_y + 1

C = confusion_matrix(y_test, predicted_y)
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
Example #30
            inter_cat = df_full['code'].astype('category')
            inter_cat = pd.get_dummies(inter_cat)
            p1 = pd.DataFrame(inter_cat.iloc[split[0]], index=X.index.values)
            result = pd.concat([X, p1], axis=1, ignore_index=True)
            p2 = pd.DataFrame(inter_cat.iloc[split[1]],
                              index=X_test.index.values)
            result1 = pd.concat([X_test, p2], axis=1, ignore_index=True)

            # Train Logistic regression
            logreg = linear_model.LogisticRegression(C=1e5,
                                                     penalty='l1',
                                                     multi_class='ovr')
            logreg.fit(result, y)
            clf_probs = logreg.predict_proba(result1)
            print("Logistic score", logreg.score(result1, y_true),
                  log_loss(y_true, clf_probs))

            # add to test dataframes
            predicted = logreg.predict(result)
            df['logreg'] = predicted

            predicted = logreg.predict(result1)
            dft['logreg'] = predicted
            '''
            # SVM
            # train
            logreg = svm.LinearSVC()
            logreg.fit(result, y)
            print("SVM score", logreg.score(result1, y_true))

            # add to test dataframes
Example #31
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f" %
      gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f" %
      gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)" %
      (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
       accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)" %
      (log_loss(y[:train_size],
                gp_fix.predict_proba(X[:train_size])[:, 1]),
       log_loss(y[:train_size],
                gp_opt.predict_proba(X[:train_size])[:, 1])))

# Plot posteriors
plt.figure()
plt.scatter(X[:train_size, 0],
            y[:train_size],
            c='k',
            label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0],
            y[train_size:],
            c='g',
            label="Test data",
            edgecolors=(0, 0, 0))
Example #32
def text_only_model(result, y):
    # keep only words that occur in more than 3 documents (min_df=4)
    text_vectorizer = CountVectorizer(min_df=4)
    x = text_vectorizer.fit_transform(result['Text'])
    Dataset = normalize(x, axis=0)
    X_tr, X_cv, y_tr, y_cv = train_test_split(Dataset, y, test_size=0.3)
    X_tr, X_test, y_tr, y_test = train_test_split(X_tr, y_tr, test_size=0.3)
    tunes_para = [10**x for x in range(-5, 1)]
    cv_array_loss = []
    # tune the SGD alpha over this grid
    for i in tunes_para:
        print("for alpha =", i)
        clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2',
                            loss='log', random_state=42)
        clf.fit(X_tr, y_tr)
        clf2 = CalibratedClassifierCV(clf, method="sigmoid")
        clf2.fit(X_tr, y_tr)
        clf2_probs = clf2.predict_proba(X_test)
        cv_array_loss.append(
            log_loss(y_test, clf2_probs, labels=clf.classes_, eps=1e-15))
        # to avoid rounding error while multiplying probabilities, we use
        # log-probability estimates
        print("Log Loss :", log_loss(y_test, clf2_probs))
################################################################	
def Combine_features(X_tr, X_cv, X_test, y_tr, y_cv, y_test, result):
	# NOTE: `y` and `Result` below are assumed to be globals from the data-prep step
	# one-hot encode the Gene feature (for train, test, cv)
	gene_vectorizer = CountVectorizer()
	train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Gene'])
	test_gene_feature_onehotCoding = gene_vectorizer.transform(X_test['Gene'])
	cv_gene_feature_onehotCoding = gene_vectorizer.transform(X_cv['Gene'])
	# one-hot encode the Variation feature (for train, test, cv)
	gene_vectorizer = CountVectorizer()
	train_Variation_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Variation'])
	test_Variation_feature_onehotCoding = gene_vectorizer.transform(X_test['Variation'])
	cv_Variation_feature_onehotCoding = gene_vectorizer.transform(X_cv['Variation'])



	text_vectorizer = CountVectorizer(min_df=4)  # keep only words that occur in more than 3 documents
	x=text_vectorizer.fit_transform(Result['Text'])


	Dataset=normalize(x,axis=0)
	X_tr, X_cv, y_tr, y_cv = train_test_split(Dataset, y, test_size=0.3)
	X_tr, X_test, y_tr, y_test = train_test_split(X_tr, y_tr, test_size=0.3)
	# let's print the shapes of the three Gene feature matrices
	print(train_gene_feature_onehotCoding.shape)
	print(test_gene_feature_onehotCoding.shape)
	print(cv_gene_feature_onehotCoding.shape)
	# let's print the shapes of the three Variation feature matrices
	print(train_Variation_feature_onehotCoding.shape)
	print(test_Variation_feature_onehotCoding.shape)
	print(cv_Variation_feature_onehotCoding.shape)

	print(X_tr.shape)
	print(X_test.shape)
	print(X_cv.shape)

	X_tr=pd.DataFrame(X_tr.todense())
	X_test=pd.DataFrame(X_test.todense())
	X_cv=pd.DataFrame(X_cv.todense())
	train_Variation_feature_onehotCoding=pd.DataFrame(train_Variation_feature_onehotCoding.todense())
	test_Variation_feature_onehotCoding=pd.DataFrame(test_Variation_feature_onehotCoding.todense())
	cv_Variation_feature_onehotCoding=pd.DataFrame(cv_Variation_feature_onehotCoding.todense())
	train_gene_feature_onehotCoding=pd.DataFrame(train_gene_feature_onehotCoding.todense())
	test_gene_feature_onehotCoding=pd.DataFrame(test_gene_feature_onehotCoding.todense())
	cv_gene_feature_onehotCoding=pd.DataFrame(cv_gene_feature_onehotCoding.todense())
	train = X_tr.join(train_gene_feature_onehotCoding,lsuffix="_X_tr",rsuffix="_train_gene_feature_onehotCoding")
	train = train.join(train_Variation_feature_onehotCoding,lsuffix="_train",rsuffix="_train_Variation_feature_onehotCoding")
	print(train.shape)
	test = X_test.join(test_gene_feature_onehotCoding,lsuffix="_X_test",rsuffix="_test_gene_feature_onehotCoding")
	test = test.join(test_Variation_feature_onehotCoding,lsuffix="_test",rsuffix="_test_Variation_feature_onehotCoding")
	print(test.shape)
	cv = X_cv.join(cv_gene_feature_onehotCoding,lsuffix="_X_cv",rsuffix="_cv_gene_feature_onehotCoding")
	cv = cv.join(cv_Variation_feature_onehotCoding,lsuffix="_cv",rsuffix="_cv_Variation_feature_onehotCoding")
	print(cv.shape)

	# before applying the model, let's replace all NaN values
	features=train.columns
	pd.options.mode.chained_assignment = None 	
	for i in features:
	    print("Done")  
	    train[i].fillna(0, inplace=True)


	features=test.columns
	pd.options.mode.chained_assignment = None 
	for i in features:
	    test[i].fillna(0, inplace=True)

	features=cv.columns
	pd.options.mode.chained_assignment = None 
	for i in features:
	    cv[i].fillna(0, inplace=True)
	return train,test,cv    
##########################################################
def logreg(train,test,cv):
	tunes_para=[10 ** x for x in range(-5, 1)]
	cv_array_loss=[]
	# want to tune for alpha in these code
	for i in tunes_para:
	    print("for alpha =", i)
	    clf = SGDClassifier(class_weight='balanced',alpha=i,penalty='l2',loss='log',random_state=42)
	    clf.fit(train,y_tr)
	    clf2 = CalibratedClassifierCV(clf, method="sigmoid")
	    clf2.fit(train,y_tr)
	    clf2_probs = clf2.predict_proba(cv)
	    cv_array_loss.append(log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15))
	    # to avoid rounding error while multiplying probabilites we use log-probability estimates
	    print("Log Loss :",log_loss(y_cv, clf2_probs)) 
###########################################################

################################################################
if __name__=="__main__":
	main()

nltk.download('stopwords')
stop = set(stopwords.words('english'))  # the NLTK English stop-word list
# let's see all the stop words
print(stop)
# loading stop words from nltk library

stop_words = set(stopwords.words('english'))

def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        # replace every special char with a space
        total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replace multiple spaces with a single space
        total_text = re.sub(r'\s+', ' ', str(total_text))
        # convert all the chars into lower-case
        total_text = total_text.lower()

        for word in total_text.split():
            # retain the word only if it is not a stop word
            if word not in stop_words:
                string += word + " "

        data_text[column][index] = string
# text processing stage
start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
for index, row in data_text.iterrows():
    nlp_preprocessing(row['Text'], index, 'Text')
print('Time taken for preprocessing the text :', time.perf_counter() - start_time, "seconds")
#merging both gene_variations and text data based on ID
data = pd.read_csv("training_variants.csv")
result = pd.merge(data, data_text,on='ID', how='left')
result.head()
# let's split the data into train and test
Result=result
y=result["Class"].values

# train_test_split from sklearn.model_selection (sklearn.cross_validation was removed)
X_tr, X_cv, y_tr, y_cv = train_test_split(result, y, test_size=0.3)
X_tr, X_test, y_tr, y_test = train_test_split(X_tr, y_tr, test_size=0.3)

# one-hot encode the Gene feature (for train, test, cv)
gene_vectorizer = CountVectorizer()
train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Gene'])
test_gene_feature_onehotCoding = gene_vectorizer.transform(X_test['Gene'])
cv_gene_feature_onehotCoding = gene_vectorizer.transform(X_cv['Gene'])
# one-hot encode the Variation feature (for train, test, cv)
gene_vectorizer = CountVectorizer()
train_Variation_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Variation'])
test_Variation_feature_onehotCoding = gene_vectorizer.transform(X_test['Variation'])
cv_Variation_feature_onehotCoding = gene_vectorizer.transform(X_cv['Variation'])



text_vectorizer = CountVectorizer(min_df=4)  # keep only words that occur in more than 3 documents
x=text_vectorizer.fit_transform(Result['Text'])


Dataset=normalize(x,axis=0)
X_tr, X_cv, y_tr, y_cv = train_test_split(Dataset, y, test_size=0.3)
X_tr, X_test, y_tr, y_test = train_test_split(X_tr, y_tr, test_size=0.3)
# let's print the shapes of the three Gene feature matrices
print(train_gene_feature_onehotCoding.shape)
print(test_gene_feature_onehotCoding.shape)
print(cv_gene_feature_onehotCoding.shape)
# let's print the shapes of the three Variation feature matrices
print(train_Variation_feature_onehotCoding.shape)
print(test_Variation_feature_onehotCoding.shape)
print(cv_Variation_feature_onehotCoding.shape)

print(X_tr.shape)
print(X_test.shape)
print(X_cv.shape)

X_tr=pd.DataFrame(X_tr.todense())
X_test=pd.DataFrame(X_test.todense())
X_cv=pd.DataFrame(X_cv.todense())
train_Variation_feature_onehotCoding=pd.DataFrame(train_Variation_feature_onehotCoding.todense())
test_Variation_feature_onehotCoding=pd.DataFrame(test_Variation_feature_onehotCoding.todense())
cv_Variation_feature_onehotCoding=pd.DataFrame(cv_Variation_feature_onehotCoding.todense())
train_gene_feature_onehotCoding=pd.DataFrame(train_gene_feature_onehotCoding.todense())
test_gene_feature_onehotCoding=pd.DataFrame(test_gene_feature_onehotCoding.todense())
cv_gene_feature_onehotCoding=pd.DataFrame(cv_gene_feature_onehotCoding.todense())
train = X_tr.join(train_gene_feature_onehotCoding,lsuffix="_X_tr",rsuffix="_train_gene_feature_onehotCoding")
train = train.join(train_Variation_feature_onehotCoding,lsuffix="_train",rsuffix="_train_Variation_feature_onehotCoding")
print(train.shape)
test = X_test.join(test_gene_feature_onehotCoding,lsuffix="_X_test",rsuffix="_test_gene_feature_onehotCoding")
test = test.join(test_Variation_feature_onehotCoding,lsuffix="_test",rsuffix="_test_Variation_feature_onehotCoding")
print(test.shape)
cv = X_cv.join(cv_gene_feature_onehotCoding,lsuffix="_X_cv",rsuffix="_cv_gene_feature_onehotCoding")
cv = cv.join(cv_Variation_feature_onehotCoding,lsuffix="_cv",rsuffix="_cv_Variation_feature_onehotCoding")
print(cv.shape)


features=train.columns
pd.options.mode.chained_assignment = None 	
for i in features:
    print("Done")  
    train[i].fillna(0, inplace=True)


features=test.columns
pd.options.mode.chained_assignment = None 
for i in features:
    test[i].fillna(0, inplace=True)

features=cv.columns
pd.options.mode.chained_assignment = None 
for i in features:
    cv[i].fillna(0, inplace=True)

print(cv.shape)
train.to_csv("rtrain.csv")
test.to_csv("rtest.csv")
cv.to_csv("rcv.csv")

tunes_para=[10 ** x for x in range(-5, 1)]
cv_array_loss=[]
# tune the SGD alpha over this grid
for i in tunes_para:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced',alpha=i,penalty='l2',loss='log',random_state=42)
    clf.fit(train,y_tr)
    clf2 = CalibratedClassifierCV(clf, method="sigmoid")
    clf2.fit(train,y_tr)
    clf2_probs = clf2.predict_proba(cv)
    cv_array_loss.append(log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilities, we use
    # log-probability estimates
    print("Log Loss :",log_loss(y_cv, clf2_probs)) 
Example #33
import numpy as np
from sklearn.metrics import hamming_loss, log_loss  # sklearn.metrics.classification was removed

# binary class
y_pred = [1, 2, 3, 4]
print(hamming_loss([1, 2, 3, 4], y_pred))  # 0.0  (perfect agreement)
print(hamming_loss([2, 2, 3, 4], y_pred))  # 0.25 (one mismatch)
print(hamming_loss([5, 6, 7, 8], y_pred))  # 1.0  (no matches)
print(hamming_loss(list("ABFD"), list("ABCD")))  # 0.25, works on string labels too

# multilabel (indicator matrices)
print(hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))))  # 0.75

y_true = [0, 0, 1, 1]
y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]  # [Pr(0), Pr(1)]
print(log_loss(y_true, y_pred))  # ~0.1738
"""
Receiver operating characteristic (ROC) Curve

roc_curve?
roc_curve(y_true, y_score, pos_label=None, 
          sample_weight=None, drop_intermediate=True)
Note: this implementation is restricted to the binary classification task.

y_true : array, shape = [n_samples]
    True binary labels in range {0, 1} or {-1, 1}.  If labels are not
    binary, pos_label should be explicitly given.

y_score : array, shape = [n_samples]
    Target scores, can either be probability estimates of the positive
    class, confidence values, or non-thresholded measure of decisions
"""
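# A short runnable illustration of the roc_curve signature described above
# (the toy values come from the scikit-learn documentation example):
from sklearn.metrics import roc_curve

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]
fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=1)
print(fpr)         # [0.  0.  0.5 0.5 1. ]
print(tpr)         # [0.  0.5 0.5 1.  1. ]
print(thresholds)  # decision thresholds in decreasing order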
Example #34
def h2o_log_loss(y_actual, y_predict, eps=1e-15, normalize=True, sample_weight=None, y_type=None):
    """Log loss, aka logistic loss or cross-entropy loss.
    This is the loss function used in (multinomial) logistic regression
    and extensions of it such as neural networks, defined as the negative
    log-likelihood of the true labels given a probabilistic classifier's
    predictions. The log loss is only defined for two or more labels.
    For a single sample with true label yt in {0,1} and
    estimated probability yp that yt = 1, the log loss is

        -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))

    This method is adapted from the ``sklearn.metrics.classification.log_loss``
    function for use with ``H2OFrame``s in skutil.


    Parameters
    ----------
    y_actual : ``H2OFrame``, shape=(n_samples,)
        The one-dimensional ground truth

    y_predict : ``H2OFrame``, shape=(n_samples, [n_classes])
        The predicted labels. Can represent a matrix. If
        ``y_predict.shape = (n_samples,)`` the probabilities provided
        are assumed to be that of the positive class. The labels in
        ``y_predict`` are assumed to be ordered ordinally.

    eps : float, optional (default=1e-15)
        Log loss is undefined for p=0 or p=1, so probabilities are
        clipped to max(eps, min(1 - eps, p)).

    normalize : bool, optional (default=True)
        If true, return the mean loss per sample.
        Otherwise, return the sum of the per-sample losses.

    sample_weight : H2OFrame or float, optional (default=None)
        A frame of sample weights of matching dims with
        y_actual and y_predict.

    y_type : string, optional (default=None)
        The type of the column. If None, will be determined.


    Returns
    -------
    loss : float


    Notes
    -----
    The logarithm used is the natural logarithm (base-e).
    """
    # SKIP THESE FOR NOW, SINCE VALIDATED IN SKLEARN PORTION
    # y_type, y_actual, y_predict = _check_targets(y_actual, y_predict, y_type)
    # _err_for_continuous(y_type)  # this is restricted to classification tasks

    if sample_weight is not None:
        if isinstance(sample_weight, H2OFrame):
            _, _, sample_weight = _check_targets(y_actual, sample_weight, 'unknown')  #  we don't care about y_type here
            sample_weight = h2o_col_to_numpy(sample_weight)
        # else we just duck type it later

    # todo: do this better someday
    y_actual = h2o_col_to_numpy(y_actual)  # this is supposed to be a ONE-dim vector
    y_predict = y_predict.as_data_frame(use_pandas=True).to_numpy()  # this might be 2-dim; .as_matrix() was removed from pandas

    # if it's a column, make it a vector.
    if len(y_predict.shape) == 2 and y_predict.shape[1] == 1:
        y_predict = y_predict.T[0]

    return log_loss(y_actual, y_predict, eps=eps,
                    normalize=normalize,
                    sample_weight=sample_weight)
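# A quick numeric check (not from the original source) of the docstring formula:
# for a single sample with yt = 1 and yp = 0.8,
# -log P(yt|yp) = -(1*log(0.8) + 0*log(0.2)) = -log(0.8) ≈ 0.2231.
import numpy as np
from sklearn.metrics import log_loss

print(log_loss([1, 0], [[0.2, 0.8], [0.8, 0.2]]))  # both samples get probability 0.8: ≈ 0.2231
print(-np.log(0.8))                                # 0.2231...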