def train_comp_naive_stat_bayes(X, y, **kwargs):
    """Train a naive Bayesian classifier on the given feature data, including the
    season-average statistics for each team.

    This is the same as naive_stat, EXCEPT that this computes the difference in each
    value, so there are fewer variables to learn over.

    Arguments:
        X: The features generated by feature_gen.py to train from
        y: The classes
        kwargs:
            verbose: If greater than zero, outputs basic information about the
                training process during training. Default 0.
            n_splits: The number of folds to use during KFold cross validation.
                Default 5.
    """
    def printverbose(*msg):
        if kwargs.get('verbose', 0) > 0:
            print(*msg)

    _X, _y = get_comp_stat_inputs(X, y)
    printverbose('Training with {} features'.format(_X.shape[1]))
    printverbose('Training on {} samples'.format(_X.shape[0]))

    # Begin the training routine. We use K-fold cross validation to estimate the
    # accuracy and generalization power of the models.
    sk_fold = KFold(n_splits=kwargs.get('n_splits', 5))
    cum_acc = 0
    for k, (train_idx, test_idx) in enumerate(sk_fold.split(_X, _y)):
        clf = BayesClassifier.from_samples(MultivariateGaussianDistribution,
                                           _X[train_idx],
                                           _y[train_idx].flatten())
        acc = accuracy_score(_y[test_idx].flatten(), clf.predict(_X[test_idx]))
        printverbose('Fold {} accuracy: {}'.format(k + 1, acc))
        cum_acc = (k * cum_acc + acc) / (k + 1.)  # running mean of fold accuracies
        printverbose('\tCurrent cumulative accuracy:', cum_acc)
    print('Cumulative accuracy after {} folds: {}'.format(k + 1, cum_acc))
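# A minimal, self-contained sketch of the idea above on synthetic data: take the
# difference between the two teams' season-average statistics (so there are fewer
# variables than keeping both sets), then run the same K-fold BayesClassifier loop.
# The random stats below merely stand in for the output of feature_gen.py /
# get_comp_stat_inputs; nothing here is part of the module above.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from pomegranate import BayesClassifier, MultivariateGaussianDistribution

rng = np.random.RandomState(0)
home_stats = rng.normal(100, 10, (200, 4))   # stand-in season-average stats
away_stats = rng.normal(98, 10, (200, 4))
X_diff = home_stats - away_stats             # comparative (difference) features
y_demo = (X_diff.sum(axis=1) + rng.normal(0, 5, 200) > 0).astype(int)

cum_acc = 0
for k, (tr, te) in enumerate(KFold(n_splits=5).split(X_diff, y_demo)):
    clf = BayesClassifier.from_samples(MultivariateGaussianDistribution,
                                       X_diff[tr], y_demo[tr])
    acc = accuracy_score(y_demo[te], clf.predict(X_diff[te]))
    cum_acc = (k * cum_acc + acc) / (k + 1.)  # running mean over folds
print('Demo cumulative accuracy over 5 folds:', cum_acc)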
def bayes_classifier(self, distributions=pm.NormalDistribution, max_iter=1e8,
                     stop_threshold=0.1):
    """Bayesian classifier for semi-supervised learning.

    Parameters
    ----------
    distributions : object
        Distribution class from the pomegranate package.
    max_iter : int, default 1e8
        The maximum number of iterations.
    stop_threshold : float, default 0.1
        Threshold at which to stop training.
    """
    model = BayesClassifier.from_samples(distributions=distributions,
                                         max_iterations=max_iter,
                                         stop_threshold=stop_threshold,
                                         X=self.X, y=self.y)
    return model.score(self.X_t, self.y_t)
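# Hedged sketch of the call the method above wraps, on synthetic data. self.X / self.y
# and self.X_t / self.y_t are whatever the enclosing class (not shown here) stores as
# its training and held-out splits; the arrays below are stand-ins for them. Labels of
# -1 mark unlabeled rows, which pomegranate's from_samples treats as semi-supervised.
import numpy as np
import pomegranate as pm
from pomegranate import BayesClassifier

rng = np.random.RandomState(0)
X = np.vstack((rng.normal(0, 1, (60, 3)), rng.normal(2, 1, (60, 3))))
y = np.array([0] * 60 + [1] * 60)
y_semi = y.copy()
y_semi[::3] = -1  # hide a third of the labels

model = BayesClassifier.from_samples(distributions=pm.NormalDistribution,
                                     max_iterations=100, stop_threshold=0.1,
                                     X=X, y=y_semi)
X_t = np.vstack((rng.normal(0, 1, (20, 3)), rng.normal(2, 1, (20, 3))))
y_t = np.array([0] * 20 + [1] * 20)
print('Held-out accuracy:', (model.predict(X_t) == y_t).mean())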
y_train = np.append(y_train, y_unlabeled, axis=0)

from pomegranate import NaiveBayes, NormalDistribution, BayesClassifier
from pomegranate import GeneralMixtureModel, MultivariateGaussianDistribution

# model_a = NaiveBayes.from_samples(NormalDistribution, x_train[y_train != -1],
#                                   y_train[y_train != -1], verbose=True,
#                                   stop_threshold=0.001)
# print("Naive Bayes - Supervised Learning Accuracy: {}".format(
#     (model_a.predict(x_val) == y_val).mean()))
# model_b = NaiveBayes.from_samples(NormalDistribution, x_train, y_train, verbose=True,
#                                   weights=None, stop_threshold=0.1, max_iterations=100)
# print("Naive Bayes - Semisupervised Learning Accuracy: {}".format(
#     (model_b.predict(x_val) == y_val).mean()))

model_c = BayesClassifier.from_samples(MultivariateGaussianDistribution, x_train, y_train,
                                       inertia=0.0, pseudocount=0.0, stop_threshold=0.1,
                                       max_iterations=100, verbose=True, n_jobs=1)
print("Bayes Classifier - Semisupervised Learning Accuracy: {}".format(
    (model_c.predict(x_val) == y_val).mean()))

# General mixture model: fit one two-component mixture per class.
d0 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 0])
d1 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 1])
d2 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 2])
d3 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 3])
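# The snippet above stops right after fitting the per-class mixtures. A common
# pomegranate pattern from here (an assumption about where this was headed, not the
# original code) is to wrap those mixtures in a BayesClassifier and refit it on the
# full training set, with the -1 labels marking the unlabeled rows:
model_d = BayesClassifier([d0, d1, d2, d3])
model_d.fit(x_train, y_train)
print("GMM Bayes Classifier - Semisupervised Learning Accuracy: {}".format(
    (model_d.predict(x_val) == y_val).mean()))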
def train_temporal_comp_stat_bayes(X, y, **kwargs):
    """Train a Bayesian model that incorporates recent game results to estimate the
    momentum a team currently has (it could possibly also marginalize over the
    momentum lost to player injuries) through the hidden state space of a trained
    HMM.

    Note, however, that we assume each opponent's prior history is conditionally
    independent of the result of the game given the target team's prior games. Not a
    great assumption, but better than assuming that both teams' prior performances
    are independent of the result of the game. We could also marginalize over the
    prediction of the result of the game as predicted for the opposing team by adding
    its HMM features, but that still wouldn't quite be theoretically justified
    (although it is closer) and is enormously more expensive.

    This specific model uses the comparative features from comp_naive_stat. Sadly,
    pomegranate doesn't have Kalman filters; it instead discretizes the continuous
    space to make the HMM, but it'll do.

    Arguments:
        X: The features generated by feature_gen.py to train from
        y: The classes
        kwargs:
            verbose: If greater than zero, outputs basic information about the
                training process during training. Default 0.
            n_splits: The number of folds to use during KFold cross validation.
                Default 5.
            n_components: The number of hidden states to use for feature generation
                with the HMM. Default 2 (winning/losing).
    """
    def printverbose(*msg):
        if kwargs.get('verbose', 0) > 0:
            print(*msg)

    _X, _y = get_comp_stat_inputs(X, y, keepSeriesID=True)
    # _X.shape[1] still counts the series ID column, since it gets replaced by the
    # hidden state below.
    printverbose('Training with {} features'.format(_X.shape[1]))
    printverbose('Training on {} samples'.format(_X.shape[0]))

    # Begin the training routine. We use K-fold cross validation to estimate the
    # accuracy and generalization power of the models.
    sk_fold = KFold(n_splits=kwargs.get('n_splits', 5))
    cum_acc = 0
    for k, (train_idx, test_idx) in enumerate(sk_fold.split(_X, _y)):
        hmm = FeatureGenerators.HiddenSpaceGenerator(_X[train_idx],
                                                     kwargs.get('n_components', 2))
        # Since the hidden state is a one-dimensional feature (with a different label
        # for each component), we stack the per-series predictions vertically and
        # glue them onto the end of the normal inputs.
        hiddenTrain = np.vstack([np.array(hmm.predict(x)).reshape(-1, 1)
                                 for x in get_series_form(_X[train_idx])])
        clf = BayesClassifier.from_samples(
            MultivariateGaussianDistribution,
            np.hstack((_X[train_idx, 1:], hiddenTrain)),
            _y[train_idx].flatten())

        hiddenTest = np.vstack([np.array(hmm.predict(x)).reshape(-1, 1)
                                for x in get_series_form(_X[test_idx])])
        acc = accuracy_score(
            _y[test_idx].flatten(),
            clf.predict(np.hstack((_X[test_idx, 1:], hiddenTest))))
        printverbose('Fold {} accuracy: {}'.format(k + 1, acc))
        cum_acc = (k * cum_acc + acc) / (k + 1.)  # running mean of fold accuracies
        printverbose('\tCurrent cumulative accuracy:', cum_acc)
    print('Cumulative accuracy after {} folds: {}'.format(k + 1, cum_acc))
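# Hedged sketch of the "hidden state as an extra feature" step above, on synthetic
# sequences. FeatureGenerators.HiddenSpaceGenerator and get_series_form are project
# helpers not shown in this excerpt, so this uses pomegranate's HiddenMarkovModel
# directly, and the fabricated sequences are stand-ins for real per-series features.
import numpy as np
from pomegranate import HiddenMarkovModel, NormalDistribution

rng = np.random.RandomState(0)
# Three short "series" of per-game scalar summaries (e.g. a running point margin).
series = [rng.normal(0, 1, 10), rng.normal(1, 1, 8), rng.normal(-1, 1, 12)]

# Fit a 2-state HMM (roughly "winning"/"losing" momentum) on the training series.
hmm = HiddenMarkovModel.from_samples(NormalDistribution, n_components=2, X=series)

# Predict the hidden state for every game in every series, stack the predictions
# vertically, and glue them onto the other per-game features as one extra column.
hidden = np.vstack([np.array(hmm.predict(s)).reshape(-1, 1) for s in series])
other_features = rng.normal(0, 1, (hidden.shape[0], 4))  # stand-in per-game features
augmented = np.hstack((other_features, hidden))
print(augmented.shape)  # one row per game, last column is the HMM hidden state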