Example #1
def build_tree(qs, field, scoref=entropy, ignore_fields=None, include_fields=None):
    """Build a classification decision tree

    >>> from pug.nlp.data.tobes_example import tobes_data
    >>> print_tree(build_tree(tobes_data, -1))  # doctest: +NORMALIZE_WHITESPACE
    0:google? 
      T-> 3:21? 
          T-> {'Premium': 3}
          F-> 2:yes? 
              T-> {'Basic': 1}
              F-> {'None': 1}
      F-> 0:slashdot? 
          T-> {'None': 3}
          F-> 2:yes? 
              T-> {'Basic': 4}
              F-> 3:21? 
                  T-> {'Basic': 1}
                  F-> {'None': 3}
    """
    if ignore_fields is None:
        ignore_fields = ('pk', 'id')
    N = qs.count()
    if not N:
        return DecisionNode()
    if include_fields is None:
        include_fields = qs[0]._meta.get_all_field_names()

    current_score = scoref(qs, field)

    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    for col in include_fields:
        if col in ignore_fields or col == field:
            continue
        # Set of unique values in this column
        # TODO: should do this once for all columns and cache it somewhere
        column_values = count_unique(qs, col)
        # Try dividing the table up for each value in this column
        for value in column_values:
            (set1, set2) = divide(qs, field=col, target=value, ignore_fields=ignore_fields, include_fields=include_fields)

            # Information improvement
            p = float(set1.count()) / N
            gain = current_score - p * scoref(set1, field) - (1 - p) * scoref(set2, field)
            if gain > best_gain and set1.count() > 0 and set2.count() > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the sub-branches, propagating the same scoring function
    if best_gain > 0:
        true_branch = build_tree(best_sets[0], field, scoref=scoref, ignore_fields=ignore_fields, include_fields=include_fields)
        false_branch = build_tree(best_sets[1], field, scoref=scoref, ignore_fields=ignore_fields, include_fields=include_fields)
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=true_branch, fb=false_branch)
    else:
        return DecisionNode(results=count_unique(qs, field=field))
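The split selection above hinges on the information-gain arithmetic: parent entropy minus the size-weighted entropies of the two subsets. Here is a minimal, self-contained sketch of that same computation on plain label lists (label_entropy and the toy data are illustrative, not part of pug):

from math import log

def label_entropy(labels, base=2):
    """Shannon entropy of a list of category labels (illustrative helper)."""
    n = float(len(labels))
    return -sum(labels.count(k) / n * log(labels.count(k) / n, base)
                for k in set(labels))

# A toy candidate split of 16 subscription labels into two subsets,
# mirroring what divide() hands back to build_tree.
set1 = ['Premium'] * 3 + ['Basic'] * 2     # rows matching the criterion
set2 = ['None'] * 7 + ['Basic'] * 4        # everything else
parent = set1 + set2

p = len(set1) / float(len(parent))
gain = (label_entropy(parent)
        - p * label_entropy(set1)
        - (1 - p) * label_entropy(set2))
print(round(gain, 4))  # positive gain means the split is informative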
Example #2
def entropy_and_impurity(qs, field, num_categories=2):
    """Gini impurity evaluation of predicted segmentation/categorization

    Returns a tuple of the entropy (in units of log base `num_categories`, so bits for the default base 2) and the impurity (a probability between 0 and 1 inclusive)

    Impurity is the probability or frequency with which the *wrong* category or prediction is assigned to an element.

    >>> from pug.nlp.data.tobes_example import tobes_data
    >>> entropy_and_impurity(tobes_data, -1)  # doctest: +ELLIPSIS
    (1.50524..., 0.6328125)
    """
    from math import log
    N = qs.count()
    counts = count_unique(qs, field)
    impurity = 0.0
    entropy = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        if p1:
            entropy -= p1 * log(p1, num_categories)
        for k2 in counts:
            if not k1 == k2:
                p2 = float(counts[k2]) / N
                impurity += p1 * p2
    return entropy, impurity
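Both doctest values can be recovered from the class counts alone: the leaves of the build_tree doctest above sum to 7 'None', 6 'Basic', and 3 'Premium'. A standalone check, with a plain dict standing in for the queryset counts:

from math import log

counts = {'None': 7, 'Basic': 6, 'Premium': 3}
n = float(sum(counts.values()))
probs = [c / n for c in counts.values()]

entropy = -sum(p * log(p, 2) for p in probs)   # base 2, the default num_categories
impurity = sum(p1 * p2 for i, p1 in enumerate(probs)
               for j, p2 in enumerate(probs) if i != j)
print(entropy, impurity)  # 1.50524..., 0.6328125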
Example #3
def gini_impurity(qs, field):
    '''Gini impurity evaluation of set of values

    Returns the probability, in [0, 1], that the wrong category/prediction has been assigned.
    '''
    N = qs.count()
    counts = count_unique(qs, field)
    impurity = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        for k2 in counts:
            if not k1 == k2:
                p2 = float(counts[k2]) / N
                impurity += p1 * p2
    return impurity
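The O(k**2) pairwise loop is the textbook definition; because the probabilities sum to 1, it reduces to the more common closed form 1 - sum(p**2). A quick equivalence check (the helper names are illustrative):

def gini_pairwise(probs):
    # sum of p_i * p_j over all pairs with i != j, as in gini_impurity above
    return sum(p1 * p2 for i, p1 in enumerate(probs)
               for j, p2 in enumerate(probs) if i != j)

def gini_closed_form(probs):
    return 1.0 - sum(p * p for p in probs)

probs = [7 / 16.0, 6 / 16.0, 3 / 16.0]
assert abs(gini_pairwise(probs) - gini_closed_form(probs)) < 1e-12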
Example #4
def entropy(qs, field, num_categories=2):
    """Total entropy (in nats, base e bits) for all the categorizations assigned

    sum(p(x) * log(p(x)) for x in count_unique(qs, field)

    Which measures how different each categorization (segmentation) is from the others
    """
    from math import log
    counts = count_unique(qs, field)
    ans = 0.0
    N = qs.count()
    for k in counts:
        p = float(counts[k]) / N
        if p:
            ans -= p * log(p, num_categories)
    return ans
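Since the log base is num_categories (2 by default), the result is in bits; entropies in different bases differ only by a constant factor, so the value in nats is the base-2 value times ln(2). A small check on the same class counts as above:

from math import log

counts = {'None': 7, 'Basic': 6, 'Premium': 3}
n = float(sum(counts.values()))

h_bits = -sum(c / n * log(c / n, 2) for c in counts.values())
h_nats = -sum(c / n * log(c / n) for c in counts.values())
assert abs(h_nats - h_bits * log(2)) < 1e-12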
Example #5
def prune(tree, mingain):
    """Recursively merge pairs of leaves whose entropy reduction is below `mingain`"""
    # If the branches aren't leaves, then prune them
    if tree.tb.results is None:
        prune(tree.tb, mingain)
    if tree.fb.results is None:
        prune(tree.fb, mingain)

    # If both of the sub-branches are now leaves, see if they should be merged
    if tree.tb.results is not None and tree.fb.results is not None:
        # Build a combined dataset as lists of single-column rows (the
        # list-based API that dt.entropy and count_unique expect here)
        tb, fb = [], []
        for v, c in tree.tb.results.items():
            tb += [[v]] * c
        for v, c in tree.fb.results.items():
            fb += [[v]] * c

        # Test the reduction in entropy: combined entropy minus the
        # average of the two branch entropies
        delta = dt.entropy(tb + fb) - (dt.entropy(tb) + dt.entropy(fb)) / 2

        if delta < mingain:
            # Merge the branches
            tree.tb, tree.fb = None, None
            tree.results = count_unique(tb + fb)
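To see the merge test in isolation, here is a sketch of the delta computation on two leaves from the build_tree doctest, with a list-based entropy standing in for dt.entropy (list_entropy is a hypothetical helper, not the pug API):

from math import log

def list_entropy(rows):
    """Base-2 entropy over the last column of a list of rows (stand-in)."""
    n = float(len(rows))
    labels = [row[-1] for row in rows]
    return -sum(labels.count(k) / n * log(labels.count(k) / n, 2)
                for k in set(labels))

tb = [['Basic']] * 4                     # leaf {'Basic': 4}
fb = [['Basic']] * 1 + [['None']] * 3    # leaf {'Basic': 1, 'None': 3}
delta = list_entropy(tb + fb) - (list_entropy(tb) + list_entropy(fb)) / 2
print(delta)  # merge the leaves only if this falls below mingain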
Example #6

# test_decider.py

from call_center.models import CaseExchange, CaseHDTVHeader, CaseMaster

from pug.db.explore import count_unique
from pug.nlp.db_decision_tree import build_tree, print_tree


N = CaseMaster.objects.count()
UN = CaseMaster.objects.values('case_number').distinct().count()
N_ce = CaseExchange.objects.count()
UN_ce = CaseExchange.objects.values('case_number').distinct().count()
N_hdtv = CaseHDTVHeader.objects.count()
UN_hdtv = CaseHDTVHeader.objects.values('case_number').distinct().count()


un = count_unique(CaseExchange.objects.values('case_number'), 'case_number')
assert len(un) == UN_ce
assert sum(un.values()) == N_ce


qs = CaseHDTVHeader.objects.filter(case_number__lt=4000000)
ex = qs.all()[0]
ex.service_calls  # touch the target field to confirm it exists on a record
print_tree(build_tree(qs, field='service_calls', ignore_fields=('id', 'case_number')))
# dispatch_status:Completed ? 
#  T-> date_time:2008-09-15 12:25:34.270000? 
#    T-> {1: 1}
#    F-> date_time:2008-07-09 08:49:36.437000? 
#      T-> {0: 1}
#      F-> {None: 0}
#  F-> {None: 0}
# Repeat on a narrower slice of cases
qs = CaseHDTVHeader.objects.filter(case_number__lt=2000000)
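The two asserts above pin down the invariants count_unique should satisfy: one entry per distinct value, with the frequencies summing to the total row count. The same invariants hold for a plain in-memory analogue built with collections.Counter (a stand-in, not the pug implementation):

from collections import Counter

case_numbers = [1001, 1002, 1002, 1003, 1003, 1003]
un = Counter(case_numbers)                     # value -> frequency

assert len(un) == len(set(case_numbers))       # one entry per distinct value
assert sum(un.values()) == len(case_numbers)   # frequencies sum to total rows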