Example #1
0
class ProbabilisticValidator():
    """
    Quick-to-train model used for validating the predictions of our main model.

    It is fit to the results our model gets on the validation set: every
    registered observation pairs the feature-existence vector with the bucket
    of the real value (bucketed columns) or with whether the prediction was
    correct (columns without buckets).
    """
    # TODO: Autodetermine smoothing factor depending on the info we know about the dataset
    _smoothing_factor = 0.5
    _probabilistic_model = None
    _X_buff = None
    _Y_buff = None

    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model.

        MultinomialNB is used when the column has at least 3 percentage
        buckets; ComplementNB is used otherwise (fewer than 3 buckets, or no
        buckets at all).

        :param col_stats: Statistics of the predicted column; must contain
            'data_type' and may contain 'percentage_buckets'
        :param data_type: Unused; the data type is read from `col_stats`
        """
        self._X_buff = []
        self._Y_buff = []
        self._predicted_buckets_buff = []
        self._real_buckets_buff = []
        self._original_real_buckets_buff = []
        self._original_predicted_buckets_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = list(range(len(self.buckets)))
            model_class = ComplementNB if len(self.buckets) < 3 else MultinomialNB
        else:
            self.buckets = None
            model_class = ComplementNB

        self._probabilistic_model = model_class(alpha=self._smoothing_factor)

        self.data_type = col_stats['data_type']

        # Maps a real-value bucket to a list of 0/1 flags (1 == prediction
        # landed in the same bucket), only for fully-featured observations
        self.bucket_accuracy = {}

    def _encode_features(self, predicted_bucket, features_existence):
        """
        Build one model input row: a one-hot encoding of the predicted bucket
        (one slot per bucket plus one extra) followed by the existence vector.
        """
        one_hot = [False] * (len(self.buckets) + 1)
        one_hot[predicted_bucket] = True
        return one_hot + features_existence

    def register_observation(self,
                             features_existence,
                             real_value,
                             predicted_value,
                             is_original_data=False,
                             hmd=None):
        """
        Register an observation in the validator's internal buffers.

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param is_original_data: When True, the observation is also stored in the buffers used by `get_confusion_matrix`
        :param hmd: Forwarded unchanged to `get_value_bucket`
        """
        if self.data_type == DATA_TYPES.NUMERIC:
            # Values that cannot be parsed as numbers are registered as None
            try:
                predicted_value = float(predicted_value)
            except (TypeError, ValueError, OverflowError):
                predicted_value = None

            try:
                # Real values may use a decimal comma
                real_value = float(str(real_value).replace(',', '.'))
            except (TypeError, ValueError, OverflowError):
                real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats, hmd)

            self._X_buff.append(
                self._encode_features(predicted_value_b, features_existence))
            self._Y_buff.append(real_value_b)
            # Append rather than alias `_Y_buff`: the training buffer is reset
            # on every fit, which would otherwise leave this history out of
            # step with `_predicted_buckets_buff`
            self._real_buckets_buff.append(real_value_b)
            self._predicted_buckets_buff.append(predicted_value_b)

            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            has_missing_features = any(
                x in (False, 0) for x in features_existence)
            if not has_missing_features:
                self.bucket_accuracy.setdefault(real_value_b, []).append(
                    int(real_value_b == predicted_value_b))
        else:
            self._X_buff.append(features_existence)
            # Without buckets the model learns "was the prediction correct"
            self._Y_buff.append(real_value == predicted_value)
            self._real_buckets_buff.append(real_value)
            self._predicted_buckets_buff.append(predicted_value)

            if is_original_data:
                self._original_real_buckets_buff.append(real_value)
                self._original_predicted_buckets_buff.append(predicted_value)

    def get_accuracy_histogram(self):
        """
        Compute the per-bucket accuracy of the registered observations.

        Buckets with no observations are reported with the overall accuracy.
        When nothing was observed at all, the overall accuracy is 0.0.
        Requires a bucketed column (`self.buckets` must not be None).

        :return: A tuple `({'buckets': [...], 'accuracies': [...]}, overall_accuracy)`
        """
        bucket_ids = list(range(len(self.buckets)))
        accuracies = []
        total_correct = 0
        total_vals = 0

        for bucket in bucket_ids:
            outcomes = self.bucket_accuracy.get(bucket)
            if outcomes:
                correct = sum(outcomes)
                total_correct += correct
                total_vals += len(outcomes)
                accuracies.append(correct / len(outcomes))
            else:
                # No observations for this bucket; filled in below
                accuracies.append(None)

        # Guard against the case where nothing was observed at all
        validation_set_accuracy = (
            total_correct / total_vals if total_vals else 0.0)

        accuracies = [
            validation_set_accuracy if acc is None else acc
            for acc in accuracies
        ]

        return {
            'buckets': bucket_ids,
            'accuracies': accuracies
        }, validation_set_accuracy

    def partial_fit(self):
        """
        Fit the probabilistic validator on all observations recorded that
        haven't been taken into account yet, then clear the buffers.

        Does nothing when no observation is pending.
        """
        if not self._X_buff:
            return

        classes = self.bucket_keys if self.buckets is not None else [True, False]

        # errstate restores the previous numpy settings even if fitting raises
        with np.errstate(divide='ignore'):
            self._probabilistic_model.partial_fit(self._X_buff,
                                                  self._Y_buff,
                                                  classes=classes)

        self._X_buff = []
        self._Y_buff = []

    def fit(self):
        """
        Fit the probabilistic validator from scratch on all buffered
        observations, then clear the buffers.
        """
        # errstate restores the previous numpy settings even if fitting raises
        with np.errstate(divide='ignore'):
            self._probabilistic_model.fit(self._X_buff, self._Y_buff)

        self._X_buff = []
        self._Y_buff = []

    def get_confusion_matrix(self):
        """
        Build the confusion matrix of the observations flagged as original data.

        :return: A dict with the matrix as nested lists of ints and the
            labels (bucket values as strings, 'UNKNOWN' when a label cannot
            be looked up in the buckets) under both 'predicted' and 'real'
        """
        # The real values are passed as `y_true`, so the rows represent
        # real values and the columns represent predicted values
        labels = list(set(self._original_real_buckets_buff))

        matrix = confusion_matrix(self._original_real_buckets_buff,
                                  self._original_predicted_buckets_buff,
                                  labels=labels)

        value_labels = []
        for label in labels:
            try:
                value_labels.append(str(self.buckets[label]))
            except (IndexError, KeyError, TypeError):
                # Label is not a valid bucket index, or there are no buckets
                value_labels.append('UNKNOWN')

        return {
            'matrix': [[int(cell) for cell in row] for row in matrix],
            'predicted': value_labels,
            'real': value_labels
        }

    @staticmethod
    def _sharpen_distribution(distribution):
        """
        Heuristic (@HACK) that concentrates a spread-out distribution.

        Only applied when more than 4 entries exceed 0.01. Entries not above
        (mean - std) are zeroed, then the smallest remaining entry above 0.001
        is subtracted from the rest, renormalizing after each step. A step
        that would wipe out all the probability mass (e.g. a uniform
        distribution) is skipped instead of producing an empty or all-zero
        result.

        :param distribution: List of class probabilities
        :return: The adjusted list of probabilities
        """
        if len([x for x in distribution if x > 0.01]) <= 4:
            return distribution

        threshold = np.mean(distribution) - np.std(distribution)
        kept = [x if x > threshold else 0 for x in distribution]

        total = sum(kept)
        if total == 0:
            # Nothing survived the cut; keep the model output
            return distribution
        kept = [x / total for x in kept]

        significant = [x for x in kept if x > 0.001]
        if not significant:
            return kept
        floor = min(significant)
        shifted = [x - floor if x > floor else 0 for x in kept]

        total = sum(shifted)
        if total == 0:
            # All surviving entries were equal; subtracting would erase them
            return kept
        return [x / total for x in shifted]

    def evaluate_prediction_accuracy(self, features_existence,
                                     predicted_value):
        """
        Evaluate how likely a prediction is to be accurate.

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: A `ProbabilityEvaluation` built from the buckets, the model's (possibly sharpened) probability distribution and the predicted value
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            X = [self._encode_features(predicted_value_b, features_existence)]
        else:
            X = [features_existence]

        distribution = self._probabilistic_model.predict_proba(np.array(X))[0]
        distribution = self._sharpen_distribution(distribution.tolist())

        return ProbabilityEvaluation(self.buckets, distribution,
                                     predicted_value)
Example #2
0
class ProbabilisticValidator():
    """
    Quick-to-train model used for validating the predictions of our main model.

    It is fit to the results our model gets on the validation set.
    """
    # TODO: Autodetermine smoothing factor depending on the info we know about the dataset
    _smoothing_factor = 0.5
    # NOTE: class-level dict, shared by every instance; not used in this class
    _value_bucket_probabilities = {}
    _probabilistic_model = None
    X_buff = None
    Y_buff = None

    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model.
        As of right now we go with ComplementNB.

        :param col_stats: Statistics of the predicted column; must contain
            'data_type' and may contain 'percentage_buckets'
        :param data_type: Unused; the data type is read from `col_stats`
        """
        self._probabilistic_model = ComplementNB(alpha=self._smoothing_factor)
        self.X_buff = []
        self.Y_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = list(range(len(self.buckets)))
        else:
            self.buckets = None

        self.data_type = col_stats['data_type']

        # Maps a predicted-value bucket to a list of 0/1 flags (1 == the real
        # value landed in the same bucket), only for fully-featured observations
        self.bucket_accuracy = {}

    def register_observation(self, features_existence, real_value, predicted_value):
        """
        Register an observation in the validator's internal buffers.

        For numeric columns, values that cannot be parsed as numbers are
        registered as None.

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        """
        # Compare by equality, not identity, so that 0.0 and numpy zeros /
        # booleans also count as "feature missing"
        has_missing_features = any(x in (False, 0) for x in features_existence)

        if self.data_type == DATA_TYPES.NUMERIC:
            try:
                predicted_value = float(predicted_value)
            except (TypeError, ValueError, OverflowError):
                predicted_value = None

            try:
                # Real values may use a decimal comma
                real_value = float(str(real_value).replace(',', '.'))
            except (TypeError, ValueError, OverflowError):
                real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)

            # One-hot encoding of the predicted bucket (one slot per bucket
            # plus one extra), followed by the existence vector
            one_hot = [False] * (len(self.buckets) + 1)
            one_hot[predicted_value_b] = True
            self.X_buff.append(one_hot + features_existence)
            self.Y_buff.append(real_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            if not has_missing_features:
                self.bucket_accuracy.setdefault(predicted_value_b, []).append(
                    int(real_value_b == predicted_value_b))
        else:
            self.X_buff.append(features_existence)
            # Without buckets the model learns "was the prediction correct"
            self.Y_buff.append(real_value == predicted_value)

    def get_accuracy_histogram(self):
        """
        Compute the accuracy of each bucket that has registered observations.

        When nothing was observed at all, the overall accuracy is 0.0.

        :return: A tuple `({'buckets': [...], 'accuracies': [...]}, overall_accuracy)`
        """
        buckets = []
        accuracies = []
        total_correct = 0
        total_vals = 0

        for bucket, outcomes in self.bucket_accuracy.items():
            correct = sum(outcomes)
            total_correct += correct
            total_vals += len(outcomes)
            buckets.append(bucket)
            accuracies.append(correct / len(outcomes))

        # Guard against the case where nothing was observed at all
        validation_set_accuracy = total_correct / total_vals if total_vals else 0.0

        return {
            'buckets': buckets,
            'accuracies': accuracies
        }, validation_set_accuracy

    def partial_fit(self):
        """
        Fit the probabilistic validator on all observations recorded that
        haven't been taken into account yet, then clear the buffers.

        Does nothing when no observation is pending.
        """
        if not self.X_buff:
            return

        classes = self.bucket_keys if self.buckets is not None else [True, False]

        # errstate restores the previous numpy settings even if fitting raises
        with np.errstate(divide='ignore'):
            self._probabilistic_model.partial_fit(self.X_buff, self.Y_buff, classes=classes)

        self.X_buff = []
        self.Y_buff = []

    def fit(self):
        """
        Fit the probabilistic validator from scratch on all buffered
        observations, then clear the buffers.
        """
        # errstate restores the previous numpy settings even if fitting raises
        with np.errstate(divide='ignore'):
            self._probabilistic_model.fit(self.X_buff, self.Y_buff)

        self.X_buff = []
        self.Y_buff = []

    def evaluate_prediction_accuracy(self, features_existence, predicted_value):
        """
        Evaluate how likely a prediction is to be accurate.

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
            one_hot = [False] * (len(self.buckets) + 1)
            one_hot[predicted_value_b] = True
            X = [one_hot + features_existence]
        else:
            X = [features_existence]

        # errstate restores the previous numpy settings even if predicting raises
        with np.errstate(divide='ignore'):
            distribution = self._probabilistic_model.predict_proba(np.array(X))

        if self.buckets is not None:
            return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability

        # The model is fit with classes [True, False]; index 1 is the
        # probability of the second class, i.e. True (prediction correct) once
        # the estimator has sorted its classes
        return distribution[0][1]