def continuous_info_gain(attribute, examples):
    """Find the best binary split of `examples` on a continuous attribute.

    The examples are ordered by their value for `attribute`, and every
    position where the goal changes between neighbouring examples is
    evaluated as a candidate threshold.  The gain of a candidate is the
    entropy of the whole set minus the size-weighted entropy of the two
    halves it produces.

    Args:
        attribute: Key passed to each example's `get()` to read its value.
        examples: Iterable of objects exposing `get(attribute)` (a number)
            and `get_goal()`.

    Returns:
        A `(best_gain, split_point)` tuple.  `split_point` is the mean of
        the attribute values on either side of the chosen boundary.  When
        no candidate boundary improves on a gain of 0 (for instance, every
        example shares one goal), `best_gain` is 0 and the boundary
        defaults to the gap between the first two examples.  An empty
        input yields `(0, 0.0)`.

    NOTE(review): `bool_entropy` and `dt_util.get_plurality_fraction` are
    defined outside this block; this assumes `bool_entropy(q)` is the binary
    entropy of a fraction `q`, matching its use on per-bucket fractions
    elsewhere in this file -- confirm.
    """
    # `key=` replaces the Python-2-only `cmp=` keyword and `cmp()` builtin;
    # the resulting (stable) order is the same and it runs on Python 2 and 3.
    sortable = sorted(examples, key=lambda example: example.get(attribute))
    total = len(sortable)

    best_gain = 0
    best_index = 1
    base_entropy = None  # Computed lazily: only needed if a boundary exists.

    index = 0
    while index < total:
        # Skip over the run of elements that share the same goal.
        current_goal = sortable[index].get_goal()
        while index < total and current_goal == sortable[index].get_goal():
            index += 1
        if index >= total:
            break

        if base_entropy is None:
            base_entropy = bool_entropy(
                dt_util.get_plurality_fraction(sortable))

        left = sortable[:index]
        right = sortable[index:]
        l_entropy = bool_entropy(dt_util.get_plurality_fraction(left))
        r_entropy = bool_entropy(dt_util.get_plurality_fraction(right))
        remainder = ((float(len(left)) / total) * l_entropy +
                     (float(len(right)) / total) * r_entropy)

        # Gain is the entropy *removed* by the split.  Maximising the
        # remainder itself would select the least informative threshold.
        gain = base_entropy - remainder
        if gain > best_gain:
            best_gain = gain
            best_index = index

    # Average the values straddling the boundary.  Dividing by the number
    # of values actually present keeps one-element input from being halved.
    boundary = sortable[best_index - 1:best_index + 1]
    if not boundary:
        return best_gain, 0.0
    split_point = float(sum(e.get(attribute) for e in boundary)) / len(boundary)
    return best_gain, split_point
def continuous_info_gain(attribute, examples):
    """Find the best binary split of `examples` on a continuous attribute.

    The examples are ordered by their value for `attribute`, and every
    position where the goal changes between neighbouring examples is
    evaluated as a candidate threshold.  The gain of a candidate is the
    entropy of the whole set minus the size-weighted entropy of the two
    halves it produces.

    Args:
        attribute: Key passed to each example's `get()` to read its value.
        examples: Iterable of objects exposing `get(attribute)` (a number)
            and `get_goal()`.

    Returns:
        A `(best_gain, split_point)` tuple.  `split_point` is the mean of
        the attribute values on either side of the chosen boundary.  When
        no candidate boundary improves on a gain of 0 (for instance, every
        example shares one goal), `best_gain` is 0 and the boundary
        defaults to the gap between the first two examples.  An empty
        input yields `(0, 0.0)`.

    NOTE(review): `bool_entropy` and `dt_util.get_plurality_fraction` are
    defined outside this block; this assumes `bool_entropy(q)` is the binary
    entropy of a fraction `q`, matching its use on per-bucket fractions
    elsewhere in this file -- confirm.
    """
    # `key=` replaces the Python-2-only `cmp=` keyword and `cmp()` builtin;
    # the resulting (stable) order is the same and it runs on Python 2 and 3.
    sortable = sorted(examples, key=lambda example: example.get(attribute))
    total = len(sortable)

    best_gain = 0
    best_index = 1
    base_entropy = None  # Computed lazily: only needed if a boundary exists.

    index = 0
    while index < total:
        # Skip over the run of elements that share the same goal.
        current_goal = sortable[index].get_goal()
        while index < total and current_goal == sortable[index].get_goal():
            index += 1
        if index >= total:
            break

        if base_entropy is None:
            base_entropy = bool_entropy(
                dt_util.get_plurality_fraction(sortable))

        left = sortable[:index]
        right = sortable[index:]
        l_entropy = bool_entropy(dt_util.get_plurality_fraction(left))
        r_entropy = bool_entropy(dt_util.get_plurality_fraction(right))
        remainder = ((float(len(left)) / total) * l_entropy +
                     (float(len(right)) / total) * r_entropy)

        # Gain is the entropy *removed* by the split.  Maximising the
        # remainder itself would select the least informative threshold.
        gain = base_entropy - remainder
        if gain > best_gain:
            best_gain = gain
            best_index = index

    # Average the values straddling the boundary.  Dividing by the number
    # of values actually present keeps one-element input from being halved.
    boundary = sortable[best_index - 1:best_index + 1]
    if not boundary:
        return best_gain, 0.0
    split_point = float(sum(e.get(attribute) for e in boundary)) / len(boundary)
    return best_gain, split_point
def discrete_info_gain(attribute, examples):
    """Return the information gain of splitting `examples` on `attribute`.

    The examples are grouped with `bucket_examples_by_attribute`, the
    size-weighted entropy of those groups is computed with
    `get_remainder_over_buckets`, and that remainder is subtracted from the
    entropy of the unsplit set.

    Args:
        attribute: The discrete attribute to split on.
        examples: Sized collection of examples (`len()` is taken of it).

    Returns:
        Entropy of the whole set minus the remainder after the split.

    NOTE(review): assumes `bool_entropy(q)` is the binary entropy of a
    fraction `q`, as it is applied to each bucket's plurality fraction in
    `get_remainder_over_buckets` -- confirm against its definition.
    """
    total = len(examples)
    bucketed = bucket_examples_by_attribute(attribute, examples)
    remainder = get_remainder_over_buckets(bucketed, total)
    # The base term must be an entropy, like the remainder it is compared
    # with.  Using the raw plurality fraction would report a gain of 1 for
    # an already-pure set instead of 0.
    base_entropy = bool_entropy(dt_util.get_plurality_fraction(examples))
    return base_entropy - remainder
def get_remainder_over_buckets(bucketed, total_count):
    """Sum the size-weighted `bool_entropy` of every bucket.

    Args:
        bucketed: Mapping whose values are sized collections of examples.
        total_count: Denominator for each bucket's weight
            (`len(bucket) / total_count`).

    Returns:
        The sum over buckets of `weight * bool_entropy(fraction)`, where
        `fraction` is `dt_util.get_plurality_fraction(bucket)`.  Returns 0
        when there are no buckets.
    """
    return sum(
        (float(len(bucket)) / total_count)
        * bool_entropy(dt_util.get_plurality_fraction(bucket))
        for bucket in bucketed.values()
    )