def MDLPC_criterion(self, data, feature, cut_point):
    '''
    Determines whether a partition is accepted according to the MDLPC
    criterion (Fayyad & Irani, 1993).

    A cut is accepted when

        Gain(A, T; S) > (log2(N - 1) + Delta(A, T; S)) / N

    where

        Delta(A, T; S) = log2(3^k - 2) - [k*Ent(S) - k1*Ent(S1) - k2*Ent(S2)]

    and S is the partition, S1/S2 are the sides of the cut, and k, k1, k2
    are the numbers of distinct classes present in S, S1 and S2.

    :param data: data partition (pandas dataframe) holding the feature column
                 and the class column named by self._class_name
    :param feature: feature of interest
    :param cut_point: proposed cut_point
    :return: True/False, whether to accept the partition
    '''
    # Work on a private copy so nothing downstream can touch the caller's frame.
    data_partition = data.copy(deep=True)

    # number of examples in current partition
    N = len(data_partition)
    if N < 2:
        # log2(N - 1) is undefined here, and a partition with fewer than two
        # examples cannot be split into two non-empty sides anyway.
        return False

    # split by cut_point
    data_left = data_partition[data_partition[feature] <= cut_point]
    data_right = data_partition[data_partition[feature] > cut_point]

    # information gain obtained when splitting data at cut_point
    cut_point_gain = cut_point_information_gain(dataset=data_partition,
                                                cut_point=cut_point,
                                                feature_label=feature,
                                                class_label=self._class_name)

    # class labels of the whole partition and of each side of the cut
    classes_all = data_partition[self._class_name]
    classes_left = data_left[self._class_name]
    classes_right = data_right[self._class_name]

    # number of distinct classes in each set
    k = len(classes_all.unique())
    k_left = len(classes_left.unique())
    k_right = len(classes_right.unique())

    # class entropy of each set
    partition_entropy = entropy(classes_all)
    entropy_left = entropy(classes_left)
    entropy_right = entropy(classes_right)

    # delta term in MDLPC criterion; note the "- 2" inside the logarithm,
    # which is part of the published definition (3^k - 2, not 3^k).
    delta = (log(3 ** k - 2, 2)
             - (k * partition_entropy)
             + (k_left * entropy_left)
             + (k_right * entropy_right))

    # to split or not to split
    gain_threshold = (log(N - 1, 2) + delta) / N
    return bool(cut_point_gain > gain_threshold)
def best_cut_point(self, data, feature):
    '''
    Selects the best cut point for a feature in a data partition based on
    information gain

    :param data: data partition (pandas dataframe)
    :param feature: target attribute
    :return: value of cut point with highest information gain (if many,
             picks the first one in candidate iteration order).
             None if no candidates
    '''
    candidates = self.boundaries_in_partition(data=data, feature=feature)
    if not candidates:
        return None

    def gain_at(cut):
        # information gain obtained by splitting `data` on `feature` at `cut`
        return cut_point_information_gain(dataset=data,
                                          cut_point=cut,
                                          feature_label=feature,
                                          class_label=self._class_name)

    # max() evaluates the key once per candidate and returns the first
    # maximal item, so ties resolve exactly as a stable descending sort
    # would, without ordering every candidate.
    return max(candidates, key=gain_at)