Example #1
from pomegranate import BayesClassifier, MultivariateGaussianDistribution
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


def train_comp_naive_stat_bayes(X, y, **kwargs):
    """Train a naive bayesian classifier on the given features data, including
    the season average statistics for each team. This is the same as
    naive_stat, EXCEPT this computes the difference in each value so there are
    less variables to learn over.

    Arguments:
        X: The features generated by feature_gen.py to train from
        y: The classes
        kwargs: Optional keyword arguments:
            verbose: If greater than zero, then outputs basic information
                about the training process during training. Default 0.
            n_splits: The number of folds to use during KFold cross validation.
                Default 5.
    """
    def printverbose(*msg):
        if kwargs.get('verbose', 0) > 0:
            print(*msg)

    _X, _y = get_comp_stat_inputs(X, y)

    printverbose('Training with {} features'.format(_X.shape[1]))
    printverbose('Training on {} samples'.format(_X.shape[0]))

    # begin the training routine. We use K-fold to estimate the accuracy and
    # generalization power of the models
    sk_fold = KFold(n_splits=kwargs.get('n_splits', 5))
    cum_acc = 0
    for k, (train_idx, test_idx) in enumerate(sk_fold.split(_X, _y)):
        clf = BayesClassifier.from_samples(MultivariateGaussianDistribution,
                                           _X[train_idx],
                                           _y[train_idx].flatten())

        acc = accuracy_score(_y[test_idx].flatten(), clf.predict(_X[test_idx]))
        printverbose('Fold {} accuracy: {}'.format(k + 1, acc))
        cum_acc = (k * cum_acc + acc) / (k + 1.)
        printverbose('\tCurrent cumulative accuracy:', cum_acc)

    print('Cumulative accuracy after {} folds: {}'.format(k + 1, cum_acc))
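For reference, here is a minimal, self-contained sketch of the same pattern (BayesClassifier.from_samples with MultivariateGaussianDistribution inside a KFold loop, keeping a running mean of the fold accuracies), assuming pomegranate's pre-1.0 API; get_comp_stat_inputs and feature_gen.py are project-specific, so synthetic Gaussian blobs stand in for the real features here:

import numpy as np
from pomegranate import BayesClassifier, MultivariateGaussianDistribution
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
# two Gaussian blobs standing in for the comparative team statistics
X_demo = np.vstack([rng.normal(0, 1, size=(200, 4)),
                    rng.normal(2, 1, size=(200, 4))])
y_demo = np.array([0] * 200 + [1] * 200)

cum_acc = 0
for k, (tr, te) in enumerate(KFold(n_splits=5, shuffle=True,
                                   random_state=0).split(X_demo)):
    clf = BayesClassifier.from_samples(MultivariateGaussianDistribution,
                                       X_demo[tr], y_demo[tr])
    acc = accuracy_score(y_demo[te], clf.predict(X_demo[te]))
    # running mean over folds: new_mean = (k * old_mean + acc) / (k + 1)
    cum_acc = (k * cum_acc + acc) / (k + 1.)
print('Cumulative accuracy after {} folds: {}'.format(k + 1, cum_acc))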
Example #2
    def bayes_classifier(self,
                         distributions=pm.NormalDistribution,
                         max_iter=1e8,
                         stop_threshold=0.1):
        """
            Bayesian Classifier for semi-supervised learning

            Parameters
            ----------
            distributions : object, pomegranate object
                Distribution object from pomegranate package
            max_iter: integer, default 1e8
                The number of maximum iterations
            stop_threshold: float, 0.1
                threshold for stop.

        """
        model = BayesClassifier.from_samples(distributions=distributions,
                                             max_iterations=max_iter,
                                             stop_threshold=stop_threshold,
                                             X=self.X,
                                             y=self.y)
        return model.score(self.X_t, self.y_t)
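As a self-contained usage sketch of the call this method wraps (again assuming pomegranate's pre-1.0 API): from_samples treats samples whose label is -1 as unlabeled, so mixing labelled and unlabelled rows in y is what makes the fit semi-supervised, the same convention Example #3 below relies on.

import numpy as np
from pomegranate import BayesClassifier, NormalDistribution

rng = np.random.RandomState(1)
X = np.concatenate([rng.normal(0, 1, (100, 3)), rng.normal(3, 1, (100, 3))])
y = np.array([0] * 100 + [1] * 100)

# hide most of the labels; pomegranate interprets y == -1 as "unlabeled"
y_semi = y.copy()
y_semi[rng.rand(200) < 0.8] = -1

model = BayesClassifier.from_samples(NormalDistribution, X, y_semi,
                                     max_iterations=100,
                                     stop_threshold=0.1)
print("Semisupervised accuracy: {}".format((model.predict(X) == y).mean()))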
Example #3
import numpy as np

y_train = np.append(y_train, y_unlabeled, axis=0)

from pomegranate import NaiveBayes, NormalDistribution, BayesClassifier
from pomegranate import GeneralMixtureModel, MultivariateGaussianDistribution

# model_a = NaiveBayes.from_samples(NormalDistribution, x_train[y_train != -1], y_train[y_train != -1], verbose=True, stop_threshold=0.001)
# print("Naive Bayes - Supervised Learning Accuracy: {}".format((model_a.predict(x_val) == y_val).mean()))

# model_b = NaiveBayes.from_samples(NormalDistribution, x_train, y_train, verbose=True, weights=None, stop_threshold=0.1, max_iterations=100)
# print("Naive Bayes - Semisupervised Learning Accuracy: {}".format((model_b.predict(x_val) == y_val).mean()))

model_c = BayesClassifier.from_samples(MultivariateGaussianDistribution,
                                       x_train,
                                       y_train,
                                       inertia=0.0,
                                       pseudocount=0.0,
                                       stop_threshold=0.1,
                                       max_iterations=100,
                                       verbose=True,
                                       n_jobs=1)
print("Bayes Classifier - Semisupervised Learning Accuracy: {}".format(
    (model_c.predict(x_val) == y_val).mean()))

# general mixture model
d0 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 0])
d1 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 1])
d2 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 2])
d3 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 3])
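A common continuation of this per-class mixture pattern (a sketch of typical pomegranate usage, not necessarily the original code) is to wrap the fitted mixtures directly in a BayesClassifier and evaluate it the same way as model_c:

# wrap the per-class mixtures into a Bayes classifier (uniform priors by
# default) and score it like the models above
model_d = BayesClassifier([d0, d1, d2, d3])
print("Bayes Classifier - GMM Components Accuracy: {}".format(
    (model_d.predict(x_val) == y_val).mean()))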
Example #4
def train_temporal_comp_stat_bayes(X, y, **kwargs):
    """Train a bayesian model which incorporates recent game results to
    estimate the momentum the team currently has (also could possibly
    marginalize over the momentum lost by player injuries?) through the hidden
    state space of a trained HMM. However, note that we assume that each
    opponent's prior history is conditionally independent of the result of the
    game given the target team's prior games. Not a great assumption but better
    than assuming that both team's prior performances are independent of the
    result of the game.

    We could also marginalize over of the prediction of the result of the game
    as predicted for the opposing team by adding it its HMM features, but that
    still wouldn't quite be theoretically justified (although also closer) and
    is enormously more expensive.

    This specific model uses the comparative features in comp_naive_stat.

    Sadly, must note that pomegranate doesn't have Kalman filters, it instead
    discretizes the continuous space to make the HMM but it'll do.

    Arguments:
        X: The features generated by feature_gen.py to train from
        y: The classes
        kwargs: Optional keyword arguments:
            verbose: If greater than zero, then outputs basic information
                about the training process during training. Default 0.
            n_splits: The number of folds to use during KFold cross validation.
                Default 5.
            n_components: The number of hidden states to use for feature
                generation with the HMM. Default 2 (winning/losing).
    """
    def printverbose(*msg):
        if kwargs.get('verbose', 0) > 0:
            print(*msg)

    _X, _y = get_comp_stat_inputs(X, y, keepSeriesID=True)

    # _X.shape[1] includes the series ID column, which is later replaced by
    # the HMM hidden state
    printverbose('Training with {} features'.format(_X.shape[1]))
    printverbose('Training on {} samples'.format(_X.shape[0]))

    # begin the training routine. We use K-fold to estimate the accuracy and
    # generalization power of the models
    sk_fold = KFold(n_splits=kwargs.get('n_splits', 5))
    cum_acc = 0
    for k, (train_idx, test_idx) in enumerate(sk_fold.split(_X, _y)):
        hmm = \
            FeatureGenerators.HiddenSpaceGenerator(_X[train_idx],
                                                   kwargs.get('n_components',
                                                              2))
        # since the hidden state is a one-dimensional feature (with a
        # different label for each component), we stack the per-series
        # predictions vertically and glue them onto the end of the normal
        # inputs (np.vstack needs a sequence, not a generator)
        hiddenTrain = np.vstack([np.array(hmm.predict(x)).reshape(-1, 1)
                                 for x in get_series_form(_X[train_idx])])
        clf = BayesClassifier.from_samples(
            MultivariateGaussianDistribution,
            np.hstack((_X[train_idx, 1:], hiddenTrain)),
            _y[train_idx].flatten())

        hiddenTest = np.vstack([np.array(hmm.predict(x)).reshape(-1, 1)
                                for x in get_series_form(_X[test_idx])])
        acc = accuracy_score(
            _y[test_idx].flatten(),
            clf.predict(np.hstack((_X[test_idx, 1:], hiddenTest))))
        printverbose('Fold {} accuracy: {}'.format(k + 1, acc))
        cum_acc = (k * cum_acc + acc) / (k + 1.)
        printverbose('\tCurrent cumulative accuracy:', cum_acc)

    print('Cumulative accuracy after {} folds: {}'.format(k + 1, cum_acc))
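FeatureGenerators.HiddenSpaceGenerator and get_series_form are project-local helpers, so here is a rough, assumed sketch (not the author's implementation) of how the hidden-state column could be produced with pomegranate itself: fit an HMM over the per-series sequences and use its predicted state path as the extra feature.

import numpy as np
from pomegranate import HiddenMarkovModel, NormalDistribution

def hidden_state_features(series_list, n_components=2):
    """Fit one HMM over all series and return each series' predicted
    hidden-state path as a column vector (a hypothetical stand-in for
    FeatureGenerators.HiddenSpaceGenerator)."""
    hmm = HiddenMarkovModel.from_samples(NormalDistribution,
                                         n_components=n_components,
                                         X=series_list)
    return [np.array(hmm.predict(s)).reshape(-1, 1) for s in series_list]

# toy usage: three univariate "series" of per-game score differentials
series = [np.random.randn(10), np.random.randn(8), np.random.randn(12)]
hidden_cols = hidden_state_features(series)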