def continuous_info_gain(attribute, examples):
    """Find the best binary split of `examples` on a numeric attribute.

    The examples are ordered by ``example.get(attribute)`` and every position
    where the goal (``example.get_goal()``) changes between neighbours is
    tried as a cut.  The cut that leaves the least size-weighted entropy in
    its two halves wins.

    Args:
        attribute: key passed to ``example.get`` to read the numeric value.
        examples: iterable of objects exposing ``get(attribute)`` and
            ``get_goal()``.

    Returns:
        A ``(gain, split_point)`` tuple.  ``gain`` is the entropy of the whole
        set minus the weighted entropy left by the best cut, or ``0`` when no
        cut exists (fewer than two examples, or only one goal value).
        ``split_point`` is the midpoint of the attribute values on either
        side of the chosen cut (of the first two examples when no cut
        exists), the lone value for a single example, and ``0.0`` for no
        examples.

    NOTE(review): assumes ``bool_entropy(q)`` is the binary entropy of a class
    fraction ``q`` and that ``dt_util.get_plurality_fraction`` returns such a
    fraction for a list of examples -- confirm against their definitions.
    """
    # key= replaces the Python 2-only cmp= argument and cmp() builtin; the
    # sort is still stable, so equal values keep their input order.
    ordered = sorted(examples, key=lambda example: example.get(attribute))
    total = len(ordered)
    goals = [example.get_goal() for example in ordered]

    best_index = 1
    best_remainder = None
    for index in range(1, total):
        # Only a change of goal between neighbours is a candidate cut.
        if goals[index - 1] == goals[index]:
            continue

        left = ordered[:index]
        right = ordered[index:]
        l_entropy = bool_entropy(dt_util.get_plurality_fraction(left))
        r_entropy = bool_entropy(dt_util.get_plurality_fraction(right))
        remainder = (float(len(left)) / total) * l_entropy + \
                    (float(len(right)) / total) * r_entropy
        # Less remaining entropy means a better cut, so minimise it; the
        # strict comparison keeps the earliest cut on ties.
        if best_remainder is None or remainder < best_remainder:
            best_remainder = remainder
            best_index = index

    if best_remainder is None:
        # No cut was possible, so nothing can be gained.
        best_gain = 0
    else:
        # Gain = entropy before the split - entropy remaining after it.
        base_entropy = bool_entropy(dt_util.get_plurality_fraction(ordered))
        best_gain = base_entropy - best_remainder

    # Average over the values actually present so that a single example
    # yields its own value instead of half of it.
    pair = [example.get(attribute)
            for example in ordered[best_index - 1:best_index + 1]]
    split_point = float(sum(pair)) / len(pair) if pair else 0.0

    return best_gain, split_point
# Example #2
# 0
def continuous_info_gain(attribute, examples):
    """Pick the best threshold for a numeric attribute and report its gain.

    `examples` are sorted by ``example.get(attribute)``; each index at which
    ``example.get_goal()`` differs from the previous example's goal is tried
    as a cut between a left and a right part.  The cut with the smallest
    size-weighted entropy over the two parts is chosen.

    Args:
        attribute: key passed to ``example.get`` to read the numeric value.
        examples: iterable of objects exposing ``get(attribute)`` and
            ``get_goal()``.

    Returns:
        ``(gain, split_point)``.  ``gain`` is the entropy of all examples
        minus the weighted entropy remaining after the best cut; it is ``0``
        when there is no cut to try (fewer than two examples or a single goal
        value).  ``split_point`` is the mean of the two attribute values
        around the chosen cut (around index 1 when there is no cut), the lone
        value when there is one example, and ``0.0`` when there are none.

    NOTE(review): assumes ``bool_entropy(q)`` is the binary entropy of a class
    fraction ``q`` and that ``dt_util.get_plurality_fraction`` returns such a
    fraction for a list of examples -- confirm against their definitions.
    """
    # sorted(key=...) works on Python 2 and 3; sort(cmp=...) and cmp() exist
    # only on Python 2.  Stability is unchanged.
    ordered = sorted(examples, key=lambda example: example.get(attribute))
    total = len(ordered)
    goals = [example.get_goal() for example in ordered]

    best_index = 1
    best_remainder = None
    for index in range(1, total):
        # Positions inside a run of equal goals are not candidate cuts.
        if goals[index - 1] == goals[index]:
            continue

        left = ordered[:index]
        right = ordered[index:]
        l_entropy = bool_entropy(dt_util.get_plurality_fraction(left))
        r_entropy = bool_entropy(dt_util.get_plurality_fraction(right))
        remainder = (float(len(left)) / total) * l_entropy + \
                    (float(len(right)) / total) * r_entropy
        # The weighted entropy is what is left over after the split, so the
        # best cut is the one that minimises it (first one wins on ties).
        if best_remainder is None or remainder < best_remainder:
            best_remainder = remainder
            best_index = index

    if best_remainder is None:
        # Nothing to split on: report zero gain without touching helpers.
        best_gain = 0
    else:
        base_entropy = bool_entropy(dt_util.get_plurality_fraction(ordered))
        best_gain = base_entropy - best_remainder

    # Divide by the number of values found, not a fixed 2, so a one-element
    # input returns that element's value.
    pair = [
        example.get(attribute)
        for example in ordered[best_index - 1:best_index + 1]
    ]
    split_point = float(sum(pair)) / len(pair) if pair else 0.0

    return best_gain, split_point
def discrete_info_gain(attribute, examples):
    """Return the information gain of splitting `examples` on `attribute`.

    The examples are grouped with ``bucket_examples_by_attribute`` and the
    size-weighted entropy of those groups (``get_remainder_over_buckets``) is
    subtracted from the entropy of the whole set.

    Args:
        attribute: attribute handed to ``bucket_examples_by_attribute``.
        examples: sized collection of examples (``len()`` is taken of it).

    Returns:
        Entropy of `examples` minus the remainder over the buckets.

    NOTE(review): assumes ``bool_entropy(q)`` is the binary entropy of a class
    fraction ``q``, as its use in ``get_remainder_over_buckets`` suggests, and
    that ``dt_util.get_plurality_fraction`` returns such a fraction --
    confirm against their definitions.
    """
    total = len(examples)
    bucketed = bucket_examples_by_attribute(attribute, examples)
    remainder = get_remainder_over_buckets(bucketed, total)

    # The remainder is an entropy, so the base term must be one as well:
    # convert the class fraction with bool_entropy before subtracting.
    base_entropy = bool_entropy(dt_util.get_plurality_fraction(examples))
    return base_entropy - remainder
# Example #4
# 0
def discrete_info_gain(attribute, examples):
    """Compute how much entropy a split on `attribute` removes.

    `examples` are bucketed by ``bucket_examples_by_attribute``; the result is
    the entropy of the full set minus the weighted entropy left in the
    buckets, as computed by ``get_remainder_over_buckets``.

    Args:
        attribute: attribute handed to ``bucket_examples_by_attribute``.
        examples: sized collection of examples (``len()`` is taken of it).

    Returns:
        Entropy of `examples` minus the remainder over the buckets.

    NOTE(review): assumes ``bool_entropy(q)`` is the binary entropy of a class
    fraction ``q``, as its use in ``get_remainder_over_buckets`` suggests, and
    that ``dt_util.get_plurality_fraction`` returns such a fraction --
    confirm against their definitions.
    """
    total = len(examples)
    bucketed = bucket_examples_by_attribute(attribute, examples)
    remainder = get_remainder_over_buckets(bucketed, total)

    # Subtracting an entropy from a raw fraction mixes units; the base term
    # has to be passed through bool_entropy first.
    base_entropy = bool_entropy(dt_util.get_plurality_fraction(examples))
    return base_entropy - remainder
def get_remainder_over_buckets(bucketed, total_count):
    """Sum ``bool_entropy`` over the buckets, weighted by bucket size.

    Args:
        bucketed: mapping whose values are sized collections of examples.
        total_count: number each bucket's length is divided by to get its
            weight.

    Returns:
        The sum over all buckets of ``len(bucket) / total_count`` times
        ``bool_entropy(dt_util.get_plurality_fraction(bucket))``; ``0`` for
        an empty mapping.
    """
    return sum(
        float(len(bucket)) / total_count
        * bool_entropy(dt_util.get_plurality_fraction(bucket))
        for bucket in bucketed.values()
    )
# Example #6
# 0
def get_remainder_over_buckets(bucketed, total_count):
    """Return the size-weighted total of ``bool_entropy`` across buckets.

    Args:
        bucketed: mapping whose values are sized collections of examples.
        total_count: divisor applied to each bucket's length to form its
            weight.

    Returns:
        ``0`` for an empty mapping, otherwise the accumulated
        ``weight * bool_entropy(dt_util.get_plurality_fraction(bucket))``.
    """
    weighted_sum = 0
    for members in bucketed.values():
        # The share is computed first, before the helpers are called.
        share = float(len(members)) / total_count
        fraction = dt_util.get_plurality_fraction(members)
        weighted_sum += share * bool_entropy(fraction)
    return weighted_sum