Ejemplo n.º 1
0
    def get_predict_data(self) -> List[PredictSample]:
        """ Build the prediction samples for the configured predict season and predict episode.

        The extracted features are passed, in this order, through the discretizer, the answered-on feature extension,
        the ANOVA F filter and the PCA stored on this instance. These transformers are only applied here, not fitted,
        so they must already be available (presumably fitted by get_train_data - verify against the caller).

        Returns:
            A list of prediction samples. Every prediction sample holds the players included in the answer, the
            players of the exam episode that are not included in the answer, the features for the included case, the
            features for the not included case and a weight. The list is empty if there is no data for the predict
            season and episode.
        """
        episode = self.__predict_episode
        raw_samples = self.__get_season_data(self.__predict_season, episode, False)
        if not raw_samples:
            return []

        extracted = np.array([ExamDropEncoder.extract_features(raw, episode) for raw in raw_samples])
        discretized = self.__discretizer.transform(extracted)
        extended = self.__add_answered_on_feature(raw_samples, discretized)
        filtered = self.__anova_f_filter.transform(extended)
        reduced = self.__pca.transform(filtered)

        sample_weights = self.__get_train_weights(raw_samples)
        # Rows are consumed in pairs: every even row supplies the sample itself and the "out" features, the odd row
        # that follows it supplies the "in" features.
        # NOTE(review): the weights are not strided like the samples, so the i-th weight is combined with the sample
        # at index 2 * i - confirm that this is intended.
        paired = zip(raw_samples[::2], reduced[1::2], reduced[::2], sample_weights)
        return [
            PredictSample(raw.answer, set(raw.exam_episode.players).difference(raw.answer),
                          in_features, out_features, weight)
            for raw, in_features, out_features, weight in paired
        ]
Ejemplo n.º 2
0
    def get_predict_data(self) -> List[PredictSample]:
        """ Build the prediction samples for the configured predict season and predict episode.

        The last ExamDropEncoder.NUM_CONTINUOUS_FEATURES feature columns are transformed by the spline encoder, all
        columns before them by the discretizer. Both transformers are only applied here, not fitted, so they must
        already be available on this instance (presumably fitted by get_train_data - verify against the caller).

        Returns:
            A list of prediction samples. Every prediction sample holds the players included in the answer, the
            players of the exam episode that are not included in the answer, the encoded features of the sample and
            a weight. The list is empty if there is no data for the predict season and episode.
        """
        episode = self.__predict_episode
        samples = self.get_season_data(self.__predict_season, episode, False)
        if not samples:
            return []

        num_continuous = ExamDropEncoder.NUM_CONTINUOUS_FEATURES
        raw_features = np.array([ExamDropEncoder.extract_features(sample, episode) for sample in samples])
        # Column layout after encoding: discretized columns first, spline encoded columns last.
        encoded_features = np.concatenate(
            (self.__discretizer.transform(raw_features[:, :-num_continuous]),
             self.__spline_encoder.transform(raw_features[:, -num_continuous:])),
            axis=1)

        sample_weights = self.__get_train_weights(samples)
        return [
            PredictSample(sample.answer, set(sample.exam_episode.players).difference(sample.answer), row, weight)
            for sample, row, weight in zip(samples, encoded_features, sample_weights)
        ]
Ejemplo n.º 3
0
    def get_train_data(self) -> Tuple[np.array, np.array, np.array, np.array]:
        """ Collect the samples of all train seasons and encode them for the machine learning algorithms.

        As a side effect a new discretizer and a new spline encoder are created, fitted on the train samples and
        stored on this instance. The last ExamDropEncoder.NUM_CONTINUOUS_FEATURES feature columns are spline encoded,
        all columns before them are discretized.

        Returns:
            The train input, train output, answered on and train weights in this order. The train input is a 2d array
            in which the discretized columns are followed by the spline encoded columns and in which every row is one
            train element. The train output holds for every row a 1.0 if the selected player is the mol and a 0.0
            otherwise. The answered on holds for every row a 1.0 if the selected player is included in the answer and
            a 0.0 otherwise. The train weights are the result of the train weight computation for the samples.
        """
        samples = [sample
                   for season in self.__train_seasons
                   for sample in self.get_season_data(season, sys.maxsize, True)]
        raw_features = np.array([ExamDropEncoder.extract_features(sample, sys.maxsize) for sample in samples])
        labels = np.array([1.0 if get_is_mol(sample.selected_player) else 0.0 for sample in samples])
        answered_on = np.array([float(sample.selected_player in sample.answer) for sample in samples])

        num_continuous = ExamDropEncoder.NUM_CONTINUOUS_FEATURES
        # The transformers are stored before they are fitted, so they can be reused for prediction afterwards.
        self.__discretizer = StableDiscretizer(self.__min_cluster_size, self.__random_generator)
        discrete_part = self.__discretizer.fit_transform(raw_features[:, :-num_continuous])
        # Every continuous feature gets the same number of spline curves.
        self.__spline_encoder = NaturalSplineEncoding([self.__spline_curves] * num_continuous)
        continuous_part = self.__spline_encoder.fit_transform(raw_features[:, -num_continuous:])

        encoded_features = np.concatenate((discrete_part, continuous_part), axis=1)
        return encoded_features, labels, answered_on, self.__get_train_weights(samples)
Ejemplo n.º 4
0
    def get_train_data(self) -> Tuple[np.array, np.array, np.array]:
        """ Collect the samples of all train seasons and transform them for the machine learning algorithms.

        As a side effect a new discretizer, ANOVA F filter and PCA are created, fitted on the train samples and stored
        on this instance. The features are discretized first, then extended with the answered on feature, then
        filtered by the ANOVA F filter and finally transformed by the PCA.

        Returns:
            The train input, train output and train weights in this order. The train input is a 2d array in which
            every row is one train element. The train output holds for every row a 1.0 if the selected player is the
            mol and a 0.0 otherwise. The train weights are the result of the train weight computation for the samples.
        """
        samples = [sample
                   for season in self.__train_seasons
                   for sample in self.__get_season_data(season, sys.maxsize, True)]
        extracted = np.array([ExamDropEncoder.extract_features(sample, sys.maxsize) for sample in samples])
        labels = np.array([1.0 if get_is_mol(sample.selected_player) else 0.0 for sample in samples])

        # Step 1: one-hot encode every feature after binning it.
        self.__discretizer = KBinsDiscretizer(n_bins = self.get_num_bins(extracted, self.__max_splits),
                                              encode = "onehot-dense", strategy = ExamDropExtractor.BIN_STRATEGY)
        discretized = self.__discretizer.fit_transform(extracted)

        # Step 2: extend the discretized features with the answered on feature.
        extended = self.__add_answered_on_feature(samples, discretized)

        # Step 3: keep only the features selected by the ANOVA F test against the labels.
        self.__anova_f_filter = SelectFpr(f_classif, alpha = self.__anova_f_significance)
        filtered = self.__anova_f_filter.fit_transform(extended, labels)

        # Step 4: apply the PCA to the remaining features.
        self.__pca = PCA(n_components = self.__pca_explain)
        reduced = self.__pca.fit_transform(filtered)

        return reduced, labels, self.__get_train_weights(samples)
Ejemplo n.º 5
0
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

from Data.PlayerData import get_is_mol
from Layers.ExamDrop.ExamDropEncoder import ExamDropEncoder
from Layers.ExamDrop.ExamDropExtractor import ExamDropExtractor
from Tools.Encoders.NaturalSplineEncoding import NaturalSplineEncoding
from Tools.Encoders.StableDiscretizer import StableDiscretizer

# Seasons of which the exam data is used to fit the regressions below.
TRAIN_SEASONS = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}
# Value handed to NaturalSplineEncoding for the single column that is encoded per iteration.
NUM_CURVES = 4

# Collect the samples of all train seasons and derive the raw features and the is-mol labels from them.
train_data = []
for season in TRAIN_SEASONS:
    train_data.extend(ExamDropExtractor.get_season_data(season, sys.maxsize, True))
train_input = np.array([ExamDropEncoder.extract_features(sample, sys.maxsize) for sample in train_data])
train_output = np.array([1.0 if get_is_mol(sample.selected_player) else 0.0 for sample in train_data])
m = ExamDropEncoder.NUM_CONTINUOUS_FEATURES

# Which samples have the selected player included in the answer. This does not depend on the feature column, so it
# is determined once instead of twice per column.
in_answer_mask = [data.selected_player in data.answer for data in train_data]
in_answer_output = [label for label, in_answer in zip(train_output, in_answer_mask) if in_answer]

# Handle the last m feature columns (the continuous features) one at a time.
for column, feature_name in zip(train_input.T[-m:], ExamDropEncoder.FEATURE_NAMES[-m:]):
    # The spline encoder is fitted on the column as a 2d array with a single column.
    spline_encoder = NaturalSplineEncoding([NUM_CURVES])
    trans_data = spline_encoder.fit_transform(column[:, np.newaxis])

    # Fit a linear regression on the encoded rows of the samples for which the selected player is in the answer.
    in_answer_input = [row for row, in_answer in zip(trans_data, in_answer_mask) if in_answer]
    regression = LinearRegression()
    regression.fit(in_answer_input, in_answer_output)

    # Encode every distinct value of the column, in ascending order, with the fitted spline encoder.
    X = np.array(sorted(set(column)))
    predict_input = spline_encoder.transform(X[:, np.newaxis])
Ejemplo n.º 6
0
from Data.PlayerData import get_is_mol
from Layers.ExamDrop.ExamDropEncoder import ExamDropEncoder
from Layers.ExamDrop.ExamDropExtractor import ExamDropExtractor
import numpy as np
import sys

# Seasons of which the exam data is used to compute the ratios below.
TRAIN_SEASONS = {
    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
}

# Collect the samples of all train seasons.
train_data = []
for season in TRAIN_SEASONS:
    train_data.extend(
        ExamDropExtractor.get_season_data(season, sys.maxsize, True))
# The features are not needed for the ratios; the extraction is kept so the script still runs it for every sample.
train_input = np.array([
    ExamDropEncoder.extract_features(sample, sys.maxsize)
    for sample in train_data
])

# Evaluate both conditions once per sample instead of once per ratio.
num_samples = len(train_data)
in_answer_flags = [data.selected_player in data.answer for data in train_data]
is_mol_flags = [bool(get_is_mol(data.selected_player)) for data in train_data]

# Fraction of the samples for which the condition holds. A ZeroDivisionError is raised if there are no samples.
in_answer_ratio = sum(in_answer_flags) / num_samples
is_mol_ratio = sum(is_mol_flags) / num_samples
both_ratio = sum(is_mol and in_answer
                 for is_mol, in_answer in zip(is_mol_flags, in_answer_flags)) / num_samples

print(f"In answer ratio: {in_answer_ratio}")
print(f"Is mol ratio: {is_mol_ratio}")
print(f"Both ratio: {both_ratio}")
print()