y_train, y_test = y[train_index], y[test_index]

    # Split of X_den by the same train/test indices; X_den_train / X_den_test
    # are not used in the rest of this loop body.
    X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # Fit every base classifier on the training split (X_train, y_train).
    clf_mNB.fit(X_train, y_train)
    clf_ridge.fit(X_train, y_train)
    clf_SGD.fit(X_train, y_train)
    clf_lSVC.fit(X_train, y_train)
    clf_SVC.fit(X_train, y_train)

    # Score the test split for this fold run. mNB and SVC are queried with
    # predict_proba; ridge, SGD and lSVC are queried with decision_function.
    prob_mNB    = clf_mNB.predict_proba(X_test)
    prob_ridge  = clf_ridge.decision_function(X_test)
    prob_SGD    = clf_SGD.decision_function(X_test)
    prob_lSVC   = clf_lSVC.decision_function(X_test)
    prob_SVC    = clf_SVC.predict_proba(X_test)

    # Element-wise sum of the five score arrays, appended to z as new rows.
    # NOTE(review): predict_proba and decision_function outputs are added
    # together unscaled -- confirm that mixing the two scales is intended.
    z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
    z = np.append(z, z_temp, axis=0)


# Delete row 0 of z (index 0 along axis 0). Per the original note, z was created
# with a row of zeros, so presumably that row is a placeholder, not a prediction.
z = np.delete(z, 0, 0)
# z is now a 2-D array with shape (n_samples, n_categories); each element is
# the sum of the classifiers' scores for that (sample, category) pair.
print z
print 'z shape:     ', z.shape

    # Split the training labels again by this run's train/test indices.
    y_train_train, y_train_test = y_train[train_index], y_train[test_index]

    # X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # Fit the active base classifiers on the inner training split
    # (the kNN classifier is disabled).
    clf_mNB.fit(X_train_train, y_train_train)
    # clf_kNN.fit(X_train_train, y_train_train)
    clf_ridge.fit(X_train_train, y_train_train)
    clf_lSVC.fit(X_train_train, y_train_train)
    clf_SVC.fit(X_train_train, y_train_train)

    # Score the inner test split for this fold run. mNB and SVC are queried
    # with predict_proba; ridge and lSVC are queried with decision_function.
    prob_mNB    = clf_mNB.predict_proba(X_train_test)
    # prob_kNN    = clf_kNN.predict_proba(X_train_test)
    prob_ridge  = clf_ridge.decision_function(X_train_test)
    prob_lSVC   = clf_lSVC.decision_function(X_train_test)
    prob_SVC    = clf_SVC.predict_proba(X_train_test)

    # Combine the score arrays element-wise and append them to z as new rows.
    # The commented-out lines are alternative combinations that are not active.
    # NOTE(review): predict_proba and decision_function outputs are added
    # together unscaled -- confirm that mixing the two scales is intended.
    # z_temp = prob_lSVC
    # z_temp = (prob_ridge + prob_lSVC)
    z_temp = (prob_mNB + prob_ridge + prob_lSVC + prob_SVC)
    z = np.append(z, z_temp, axis=0)


# Delete row 0 of z (index 0 along axis 0). Per the original note, z was created
# with a row of zeros, so presumably that row is a placeholder, not a prediction.
z = np.delete(z, 0, 0)
# z is now a 2-D array with shape (n_samples, n_categories); each element is
# the sum of the classifiers' scores for that (sample, category) pair.
# Optional preprocessing of z (currently disabled): L2-normalisation.
# z = normalize(z, norm="l2")
# Example #3
# 0
    # Print the document, then one line per label, then a blank separator line.
    print doc
    for label in labels:
            # label[0] is the score (per the original note); label[1] is used
            # as an index into data_train.target_names to get the label's name.
            print data_train.target_names[label[1]], label[0]
    print


#####################################
# decision_function and predict_proba
#
# For each classifier: print the estimator itself, then its scores for X_new,
# then a blank separator line. Which scoring method is called differs per
# classifier, as noted below.

# clf_nb: scored with predict_proba.
print clf_nb
pred_prob = clf_nb.predict_proba(X_new)
print pred_prob
print

# clf_lsvc: scored with decision_function.
print clf_lsvc
pred_decision = clf_lsvc.decision_function(X_new)
print pred_decision
print 

# clf_svc: scored with predict_proba as a workaround (see the author's note).
print clf_svc
# SVC should have the decision_function method, but got error:
# error - ValueError: setting an array element with a sequence
# pred_decision = clf_svc.decision_function(X_new)
pred_prob = clf_svc.predict_proba(X_new)
print pred_prob
print

# clf_sgd: scored with decision_function.
# NOTE(review): pred_decision is not printed in the lines that follow here,
# unlike the classifiers above -- confirm whether a print is missing.
print clf_sgd
pred_decision = clf_sgd.decision_function(X_new)
# Author's note: predict_proba is only supported for binary classification!
# pred_prob = clf_sgd.predict_proba(X_new)
# Example #4
# 0
# Train a LinearSVC on term-frequency features of "TitlePlusBody" and report
# validation accuracy, MCLL and elapsed time.

# ~140k rows are divided into ~100k for training and ~40k held out for testing.
ff_train, ff_val = split_dataframe(test_ff)

print("Training...")

t1 = time()

# Token counts first, then term-frequency scaling with IDF turned off.
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(ff_train["TitlePlusBody"])
tfidf_transformer = TfidfTransformer(use_idf=False)

# 98190x285052
train_tfidf_table = tfidf_transformer.fit_transform(train_counts)

clf = LinearSVC()
train_labels = ff_train["OpenStatus"]
clf.fit(train_tfidf_table, train_labels)

print("Testing...")

# Apply the already-fitted vectorizer and transformer to the held-out text.
test_counts = vectorizer.transform(ff_val["TitlePlusBody"])
test_tfidf_table = tfidf_transformer.transform(test_counts)

predict = clf.predict(test_tfidf_table)
val_labels = ff_val["OpenStatus"]
accuracy = np.mean(predict == val_labels)
print("np.mean: %f" % (accuracy))

# Map the decision scores through a logistic sigmoid, then raise the result to
# the power 3.5 before scoring with mcll.
# NOTE(review): the rationale for the 3.5 exponent is not shown -- confirm.
linear_decisions = clf.decision_function(test_tfidf_table)
sigmoid = 1 / (1 + np.exp(- linear_decisions))
predicted_probs = sigmoid ** 3.5
mcll_score = mcll(predicted_probs, ff_val["OpenStatus"].values)
print("MCLL: %f" % (mcll_score))

t2 = time()
elapsed = t2 - t1
print("done in %d seconds" % (elapsed))