# Module-level imports assumed by the methods below.
import numpy as np
import scipy.stats


def eval_question(self, ngram_table, pred_var_set_pair):
    """
    Evaluate a question by computing the average conditional entropy,
    the entropy reduction, and the belongs-to / not-belongs-to
    probabilities.
    """
    x_index, set_data = pred_var_set_pair
    question = Question()
    if self.question_already_asked(x_index, set_data):
        # The reduction is 0 by default for a question
        return question
    question.set = set_data
    question.predictor_variable_index = x_index
    self.count_target_word_frequencies(
        ngram_table, x_index, set_data, question
    )
    question.b_dist_entropy = self.frequencies_to_probabilities_and_entropy(
        question.b_dist
    )
    question.nb_dist_entropy = self.frequencies_to_probabilities_and_entropy(
        question.nb_dist
    )
    size_row_fragment = len(self.row_fragment_indices)
    # `== 0`, not `is 0`: `is` tests object identity and only happens
    # to work for small interned integers.
    question.b_probability = 0 if size_row_fragment == 0 else (
        self.probability
        * float(len(question.b_indices)) / size_row_fragment
    )
    question.nb_probability = 0 if size_row_fragment == 0 else (
        self.probability
        * float(len(question.nb_indices)) / size_row_fragment
    )
    question.avg_conditional_entropy = (
        (question.b_probability * question.b_dist_entropy)
        + (question.nb_probability * question.nb_dist_entropy)
    )
    question.reduction = (
        self.probabilistic_entropy - question.avg_conditional_entropy
    )
    return question
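# Both methods in this excerpt hand back a bare Question() when a
# question is skipped, relying on its `reduction` defaulting to 0. The
# original class definition is not part of this excerpt; the following
# is a minimal, hypothetical sketch consistent with the attributes
# these methods read or set. Every default here is an assumption.
class Question:
    """Hypothetical sketch of the Question container used here."""

    def __init__(self):
        self.predictor_variable_index = None
        self.set = None
        # Row indices / data rows on the YES (b) and NO (nb) sides,
        # filled in by eval_question / generate_questions respectively.
        self.b_indices = []
        self.nb_indices = []
        self.b_fragment = None
        self.nb_fragment = None
        # Target-word distributions and their entropies on each side.
        self.b_dist = {}
        self.nb_dist = {}
        self.b_dist_entropy = 0.0
        self.nb_dist_entropy = 0.0
        # Probability of following each branch from this node.
        self.b_probability = 0.0
        self.nb_probability = 0.0
        self.avg_conditional_entropy = 0.0
        # Reduction defaults to 0 so that skipped questions (already
        # asked, or set membership already known) never look useful.
        self.reduction = 0.0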
def generate_questions(self, ngram_table, pred_var_set_pairs_generator):
    """
    Generate questions, computing for each the average conditional
    entropy, the entropy reduction, and the belongs-to /
    not-belongs-to probabilities.
    """
    for pred_var_set_pair in pred_var_set_pairs_generator():
        x_index, set_index = pred_var_set_pair
        question = Question()
        if self.question_already_asked(x_index, set_index):
            # The reduction is 0 by default for a question
            yield question
            continue
        if self.set_known_predvars[x_index]:
            # We already know which set this predictor variable
            # belongs to in this node's slice of data, so there is no
            # point asking the question. The reduction is 0 by default.
            yield question
            continue
        question.set = set_index
        question.predictor_variable_index = x_index
        # Split the node's data on the question: rows whose predictor
        # variable is in the set (b) and the rest (nb).
        condition = self.data_fragment[:, x_index] == set_index
        question.b_fragment = self.data_fragment.compress(
            condition, axis=0
        )
        question.nb_fragment = self.data_fragment.compress(
            ~condition, axis=0
        )
        # The target word occupies the last column of the fragment.
        target_column_index = self.data_fragment.shape[1] - 1
        b_probabilities = np.bincount(
            question.b_fragment[:, target_column_index]
        ).astype('float32') / question.b_fragment.shape[0]
        nb_probabilities = np.bincount(
            question.nb_fragment[:, target_column_index]
        ).astype('float32') / question.nb_fragment.shape[0]
        question.b_dist = {
            index: b_probabilities[index]
            for index in range(len(b_probabilities))
        }
        question.nb_dist = {
            index: nb_probabilities[index]
            for index in range(len(nb_probabilities))
        }
        question.b_dist_entropy = scipy.stats.entropy(
            b_probabilities, base=2
        )
        question.nb_dist_entropy = scipy.stats.entropy(
            nb_probabilities, base=2
        )
        size_data = self.data_fragment.shape[0]
        # Probability of reaching the next node on the YES path
        question.b_probability = 0 if size_data == 0 else (
            self.probability
            * float(question.b_fragment.shape[0]) / size_data
        )
        # Probability of reaching the next node on the NO path
        question.nb_probability = 0 if size_data == 0 else (
            self.probability
            * float(question.nb_fragment.shape[0]) / size_data
        )
        # Average conditional entropy of the target after the split
        question.avg_conditional_entropy = (
            (question.b_probability * question.b_dist_entropy)
            + (question.nb_probability * question.nb_dist_entropy)
        )
        # Entropy reduction this question achieves at this node
        question.reduction = (
            self.probabilistic_entropy - question.avg_conditional_entropy
        )
        yield question
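# Illustrative usage sketch (not from the original source): scan the
# generated questions and keep the one with the largest entropy
# reduction. `node` is a hypothetical stand-in for the object these
# methods belong to, and `candidate_pairs` must be a callable yielding
# (predictor variable index, set) pairs, matching the
# pred_var_set_pairs_generator() call above.
def pick_best_question(node, ngram_table, candidate_pairs):
    best = None
    for question in node.generate_questions(ngram_table, candidate_pairs):
        # Skipped questions keep reduction == 0, so they are chosen
        # only if no candidate reduces the node's entropy.
        if best is None or question.reduction > best.reduction:
            best = question
    return best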