Example #1
    def eval_question(self, ngram_table, pred_var_set_pair):
        """
            Evaluate question by computing the avg conditional entropy,
            reduction, belongs to and not belongs to probability
        """

        x_index, set_data = pred_var_set_pair
        question = Question()
        if self.question_already_asked(x_index, set_data):
            #The reduction is set to 0 by default for a question
            return question

        question.set = set_data
        question.predictor_variable_index = x_index
        self.count_target_word_frequencies(ngram_table, x_index, set_data, question)
        question.b_dist_entropy = self.frequencies_to_probabilities_and_entropy(question.b_dist)
        question.nb_dist_entropy = self.frequencies_to_probabilities_and_entropy(question.nb_dist)

        size_row_fragment = len(self.row_fragment_indices)

        # Probability for the next node on the YES ("belongs to") path
        question.b_probability = 0 if size_row_fragment == 0 else (
            self.probability * float(len(question.b_indices)) / size_row_fragment
        )
        # Probability for the next node on the NO ("does not belong") path
        question.nb_probability = 0 if size_row_fragment == 0 else (
            self.probability * float(len(question.nb_indices)) / size_row_fragment
        )
        # Average conditional entropy of the split, weighted by path probability
        question.avg_conditional_entropy = (
            (question.b_probability * question.b_dist_entropy)
            + (question.nb_probability * question.nb_dist_entropy)
        )
        # Entropy reduction (information gain) from asking this question
        question.reduction = (
            self.probabilistic_entropy - question.avg_conditional_entropy
        )

        return question
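
The reduction computed above is the information gain of a question: the node's probabilistic entropy minus the probability-weighted average of the entropies on the "belongs" and "does not belong" branches. A minimal standalone sketch of that arithmetic, assuming self.probabilistic_entropy is the node's entropy weighted by its probability (all distribution and split values below are made-up illustration numbers):

    import scipy.stats

    node_probability = 1.0  # probability mass reaching this node
    probabilistic_entropy = node_probability * scipy.stats.entropy(
        [0.5, 0.5], base=2)  # 1.0 bit at the node

    # Suppose the question splits the node's data 60/40
    b_probability = node_probability * 0.6
    nb_probability = node_probability * 0.4
    b_dist_entropy = scipy.stats.entropy([0.9, 0.1], base=2)   # ~0.469 bits
    nb_dist_entropy = scipy.stats.entropy([0.2, 0.8], base=2)  # ~0.722 bits

    avg_conditional_entropy = (b_probability * b_dist_entropy
                               + nb_probability * nb_dist_entropy)  # ~0.570
    reduction = probabilistic_entropy - avg_conditional_entropy
    print(reduction)  # ~0.43 bits gained by asking the question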
Example #2
    def generate_questions(self, ngram_table, pred_var_set_pairs_generator):
        """
            Evaluate question by computing the avg conditional entropy,
            reduction, belongs to and not belongs to probability
        """

        for pred_var_set_pair in pred_var_set_pairs_generator():
            x_index, set_index = pred_var_set_pair
            question = Question()
            if self.question_already_asked(x_index, set_index):
                # The reduction is set to 0 by default for a question
                yield question
                continue

            if self.set_known_predvars[x_index]:
                # We already know which set this predictor variable belongs
                # to in this node's slice of data, so there is no point in
                # asking this question. The reduction stays at its default of 0.
                yield question
                continue

            question.set = set_index
            question.predictor_variable_index = x_index
            condition = self.data_fragment[:, x_index] == set_index
            question.b_fragment = self.data_fragment.compress(
                condition, axis=0)
            question.nb_fragment = (
                self.data_fragment.compress(~condition, axis=0)
            )

            target_column_index = self.data_fragment.shape[1] - 1
            b_probabilities = np.bincount(
                question.b_fragment[:, target_column_index]
            ).astype('float32') / question.b_fragment.shape[0]
            nb_probabilities = np.bincount(
                question.nb_fragment[:, target_column_index]
            ).astype('float32') / question.nb_fragment.shape[0]

            question.b_dist = {
                index: b_probabilities[index] for index in range(
                    len(b_probabilities)
                )
            }
            question.nb_dist = {
                index: nb_probabilities[index] for index in range(
                    len(nb_probabilities)
                )
            }

            question.b_dist_entropy = scipy.stats.entropy(
                b_probabilities, base=2
            )
            question.nb_dist_entropy = scipy.stats.entropy(
                nb_probabilities, base=2
            )

            size_data = self.data_fragment.shape[0]

            # Probability for the next node on the YES path
            question.b_probability = 0 if size_data == 0 else (
                self.probability * float(
                    question.b_fragment.shape[0]
                ) / size_data
            )
            # Probability for the next node on the NO path
            question.nb_probability = 0 if size_data == 0 else (
                self.probability * float(
                    question.nb_fragment.shape[0]
                ) / size_data
            )
            # Average conditional entropy of the split for this node
            question.avg_conditional_entropy = (
                (question.b_probability * question.b_dist_entropy) +
                (question.nb_probability * question.nb_dist_entropy)
            )
            # Entropy reduction (information gain) for the current node
            question.reduction = (
                self.probabilistic_entropy - question.avg_conditional_entropy
            )

            yield question
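
A caller would typically scan the yielded questions for the one with the greatest reduction before splitting the node. A minimal sketch of that selection, assuming a node object exposing generate_questions as above (split_on and the generator wiring are hypothetical names for illustration):

    # Pick the question whose split removes the most entropy at this node
    best_question = max(
        node.generate_questions(ngram_table, pred_var_set_pairs_generator),
        key=lambda q: q.reduction,
    )
    if best_question.reduction > 0:
        # A positive reduction means splitting lowers expected entropy,
        # so the tree-growing procedure would branch on this question.
        split_on(best_question)  # hypothetical tree-building step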
Example #3
    def generate_questions(self, ngram_table, pred_var_set_pairs_generator):
        """
            Evaluate question by computing the avg conditional entropy,
            reduction, belongs to and not belongs to probability
        """

        for pred_var_set_pair in pred_var_set_pairs_generator():
            x_index, set_index = pred_var_set_pair
            question = Question()
            if self.question_already_asked(x_index, set_index):
                # The reduction is set to 0 by default for a question
                yield question
                continue

            if self.set_known_predvars[x_index]:
                # We already know which set this predictor variable belongs
                # to in this node's slice of data, so there is no point in
                # asking this question. The reduction stays at its default of 0.
                yield question
                continue

            question.set = set_index
            question.predictor_variable_index = x_index
            condition = self.data_fragment[:, x_index] == set_index
            question.b_fragment = self.data_fragment.compress(condition,
                                                              axis=0)
            question.nb_fragment = (self.data_fragment.compress(~condition,
                                                                axis=0))

            target_column_index = self.data_fragment.shape[1] - 1
            b_probabilities = np.bincount(
                question.b_fragment[:, target_column_index]).astype(
                    'float32') / question.b_fragment.shape[0]
            nb_probabilities = np.bincount(
                question.nb_fragment[:, target_column_index]).astype(
                    'float32') / question.nb_fragment.shape[0]

            question.b_dist = {
                index: b_probabilities[index]
                for index in range(len(b_probabilities))
            }
            question.nb_dist = {
                index: nb_probabilities[index]
                for index in range(len(nb_probabilities))
            }

            question.b_dist_entropy = scipy.stats.entropy(b_probabilities,
                                                          base=2)
            question.nb_dist_entropy = scipy.stats.entropy(nb_probabilities,
                                                           base=2)

            size_data = self.data_fragment.shape[0]
            # Probability for the next node on the YES path
            question.b_probability = 0 if size_data == 0 else (
                self.probability * float(question.b_fragment.shape[0]) /
                size_data)
            # Probability for the next node on the NO path
            question.nb_probability = 0 if size_data == 0 else (
                self.probability * float(question.nb_fragment.shape[0]) /
                size_data)
            # Average conditional entropy of the split for this node
            question.avg_conditional_entropy = (
                (question.b_probability * question.b_dist_entropy) +
                (question.nb_probability * question.nb_dist_entropy))
            # Entropy reduction (information gain) for the current node
            question.reduction = (self.probabilistic_entropy -
                                  question.avg_conditional_entropy)

            yield question
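
Both generator variants split the data with ndarray.compress; for a boolean mask this is equivalent to plain boolean indexing, which is the more idiomatic NumPy spelling. A small sketch of the equivalence (array contents are made up):

    import numpy as np

    data_fragment = np.array([[0, 1], [1, 0], [0, 0]])
    condition = data_fragment[:, 0] == 0  # boolean mask over rows

    b_fragment = data_fragment.compress(condition, axis=0)
    nb_fragment = data_fragment[~condition]  # same rows as compress(~condition)

    assert (b_fragment == data_fragment[condition]).all()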