from sklearn.linear_model import SGDClassifier
from shared import bootstrap_accuracy, bootstrap_auc

accuracy_graphs = {}
auc_graphs = {}

best_score = 0.0
best_model = None
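# Random-search baseline: draw 1000 random weight vectors and keep whichever scores best
# on the validation split.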
for i in range(1000):
    m = LinearModel.random(D)
    vali_score = m.score(X_vali, y_vali)  # evaluate on the validation split
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = m
        print("rand[{}] = {:.3}".format(i, vali_score))

print(["{:1.3f}".format(x[0]) for x in best_model.weights.tolist()])

accuracy_graphs["Random"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["Random"] = bootstrap_auc(best_model, X_vali, y_vali)

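# Model selection over sklearn's SGDClassifier: try 20 different random seeds and keep
# whichever model scores best on the validation split.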
best_score = 0.0  # reset so the "SGD" entries below reflect the best SGD model, not the random baseline
best_model = None
for i in range(20):
    sgd = SGDClassifier(random_state=i + RANDOM_SEED)
    sgd.fit(X_train, y_train)
    vali_score = sgd.score(X_vali, y_vali)
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = sgd
        print("sgd[{}] = {:.3}".format(i, vali_score))

accuracy_graphs["SGD"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["SGD"] = bootstrap_auc(best_model, X_vali, y_vali)


def mini_ca():
    ...


# Example 2
import numpy as np
import torch
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# import data; choose feature space
from dataset_poetry import y_train, Xd_train, y_vali, Xd_vali

X_train = Xd_train["numeric"]
X_vali = Xd_vali["numeric"]

#%%
from sklearn.linear_model import LogisticRegression

m = LogisticRegression(random_state=RANDOM_SEED, penalty="none", max_iter=2000)
m.fit(X_train, y_train)

print("skLearn-LR AUC: {:.3}".format(np.mean(bootstrap_auc(m, X_vali,
                                                           y_vali))))
print("skLearn-LR Acc: {:.3}".format(m.score(X_vali, y_vali)))


def nearly_eq(x, y, tolerance=1e-6):
    return abs(x - y) < tolerance

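# e.g. nearly_eq(0.1 + 0.2, 0.3) is True even though 0.1 + 0.2 != 0.3 exactly,
# because of floating-point rounding.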

#%%

(N, D) = X_train.shape

X = torch.from_numpy(X_train).float()
y = torch.from_numpy(y_train).long()
Xv = torch.from_numpy(X_vali).float()
yv = torch.from_numpy(y_vali).long()
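# The tensors above could be used to fit a logistic-regression-style model directly in
# torch. This is a minimal sketch (not the course's exact training code): a single linear
# layer trained with cross-entropy loss via plain SGD.
model = torch.nn.Linear(D, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
objective = torch.nn.CrossEntropyLoss()
for epoch in range(100):
    optimizer.zero_grad()
    loss = objective(model(X), y)
    loss.backward()
    optimizer.step()
# Validation accuracy of the sketch model:
print("torch-LR Acc: {:.3}".format(
    (model(Xv).argmax(dim=1) == yv).float().mean().item()))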
# Example 3
from sklearn.metrics import roc_auc_score

results = {}
#%% try sklearn MultinomialNB:

## SKLearn has its own Multinomial Naive Bayes,
#  and it uses alpha / additive smoothing to deal with zero counts!
from sklearn.naive_bayes import MultinomialNB

# Try a couple alpha values (what to do with zero-prob words!)
# Alpha can really be anything positive!
for alpha in [0.05, 0.1, 1.0, 10.0, 50.0]:
    m = MultinomialNB(alpha=alpha)
    m.fit(X_train, y_train)
    scores = m.predict_proba(X_vali)[:, 1]
    print("Accuracy: {:.3}, AUC: {:.3}".format(
        m.score(X_vali, y_vali), roc_auc_score(y_score=scores, y_true=y_vali)))
    print("What I called log(beta)={}".format(m.class_log_prior_[1]))
    results["MNB(alpha={})".format(alpha)] = bootstrap_auc(m, X_vali, y_vali)

#%% Showcase linear smoothing:

from collections import Counter
from dataclasses import dataclass, field
import typing


# P(x|POETRY) / P(x|EVERYTHING) > some random constant?
@dataclass
class CountLanguageModel:
    """ The number of times each word has been observed. """

    counts: typing.Counter[str] = field(default_factory=Counter)
    # default_factory: zero-argument callable that will be called when a default value is needed for this field
    """ The total number of observed words (any word)"""
# Example 4
train_sX = np.hstack(
    [
        best_textual.m.predict_proba(train_xd["textual"]),
        train_xd["numeric"],
    ]
)
vali_sX = np.hstack(
    [
        best_textual.m.predict_proba(vali_xd["textual"]),
        vali_xd["numeric"],
    ]
)
test_sX = np.hstack(
    [
        best_textual.m.predict_proba(test_xd["textual"]),
        test_xd["numeric"],
    ]
)


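# "Stacking": train a second-stage LogisticRegression whose inputs are the first-stage
# model's predicted probabilities concatenated with the plain numeric features.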
stacked = LogisticRegression(random_state=RAND)
stacked.fit(train_sX, train_y)

graphs = {
    "textual": bootstrap_auc(best_textual.m, test_xd["textual"], test_y),
    "numeric": bootstrap_auc(best_numeric.m, test_xd["numeric"], test_y),
    "merged": bootstrap_auc(best_merged.m, test_xd["merged"], test_y),
    "stacked": bootstrap_auc(stacked, test_sX, test_y),
}

simple_boxplot(
    graphs, ylabel="AUC", xlabel="method", save="graphs/p10-early-vs-stacked.png"
)

# %%
    for _ in range(num_iter):
        for _ in range(n_samples):
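            # Draw a random minibatch (sampled with replacement) and take one gradient step on it.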
            X_mb, y_mb = resample(X_train, y_train, n_samples=minibatch_size)
            m.weights += alpha * compute_gradient_update(m, X_mb, y_mb)
        # record performance:
        plot.add_sample(m, X_train, y_train, X_vali, y_vali)

    return m


# 2. pick a smaller max_iter that gets good performance.
# When num_iter is 1000, both the training and validation curves have more or less flattened out.

for alpha in [0.05, 0.1, 0.5, 1.0, 2.0]:
    m = train_logistic_regression_sgd_opt("LR-SGD", alpha, num_iter=1000)
    print("LR-SGD AUC: {:.3}".format(np.mean(bootstrap_auc(m, X_vali,
                                                           y_vali))))
    print("LR-SGD Acc: {:.3}".format(m.score(X_vali, y_vali)))

# (A) Explore Learning Rates:
#
# 3. make ``alpha``, the learning rate, a parameter of the train function.
# 4. make a graph including some faster and slower alphas
# .... what do you notice?

## Both training and validation curves move up (and flatten out sooner) as alpha increases, but
# the change becomes less noticeable for alpha greater than or equal to 0.5.

## Create training curve plots:
import matplotlib.pyplot as plt

for key, dataset in learning_curves.items():
    ...


# Example 6
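        # Some sklearn models expose decision_function rather than predict_proba;
        # either one works as a ranking score for roc_auc_score.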
        scores = m.decision_function(X_vali)
    else:
        scores = m.predict_proba(X_vali)[:, 1]
    print("\tVali-AUC: {:.3}".format(
        roc_auc_score(y_score=scores, y_true=y_vali)))

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Is it randomness? Use simple_boxplot and bootstrap_auc/bootstrap_acc to see if the differences are meaningful!
from shared import bootstrap_accuracy, bootstrap_auc
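# For reference, a bootstrap helper of this kind is conceptually simple. A minimal sketch
# (hypothetical; the real shared.py helpers may differ in signature and sample count):
def bootstrap_accuracy_sketch(f, X, y, num_samples=100):
    scores = []
    for _ in range(num_samples):
        # Resample the evaluation data with replacement and score the already-fitted model.
        X_rs, y_rs = resample(X, y)
        scores.append(accuracy_score(y_rs, f.predict(X_rs)))
    return scores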
from sklearn.tree import DecisionTreeClassifier

f = DecisionTreeClassifier()
f.fit(X_train, y_train)
# Use distinct variable names so we don't shadow the imported bootstrap_* helpers.
acc_samples = bootstrap_accuracy(f=f, X=X_vali, y=y_vali)
auc_samples = bootstrap_auc(f=f, X=X_vali, y=y_vali)

print(acc_samples[:1])
print(auc_samples[:1])

plt.boxplot([acc_samples, auc_samples])
plt.xticks(ticks=[1, 2], labels=["bootstrap_acc", "bootstrap_auc"])
plt.xlabel("DecisionTree bootstraps")
plt.ylabel("score")
plt.ylim([0.3, 1.0])
plt.show()

# 2.D. Is it randomness? Control for random_state parameters!
"""
Results should be something like: