def __test__():
    import measure, datetime

    data = [
        {'a': 2, 'b': datetime.date(2003, 12, 5), 'c': 'A', 'cls': 1},
        {'a': 7, 'b': datetime.date(2004, 12, 5), 'c': 'A', 'cls': 1},
        {'a': 1, 'b': datetime.date(2007, 12, 5), 'c': 'A', 'cls': 2},
        {'a': 9, 'b': datetime.date(2008, 12, 5), 'c': 'D', 'cls': 3},
        {'a': 3, 'b': datetime.date(2009, 12, 5), 'c': 'B', 'cls': 1},
        {'a': 2, 'b': datetime.date(2010, 12, 5), 'c': 'C', 'cls': 3},
    ]

    entropy_impurity = measure.entropy(data, 'cls')
    giniidx_impurity = measure.giniidx(data, 'cls')
    cls_err_impurity = measure.cls_err(data, 'cls')

    print 'split'
    print ratio(data, 'a', 'cls', measure.entropy, entropy_impurity)
    print ratio(data, 'a', 'cls', measure.giniidx, giniidx_impurity)
    print ratio(data, 'a', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print ratio(data, 'a', 'cls', measure.entropy, entropy_impurity, False)
    print ratio(data, 'a', 'cls', measure.giniidx, giniidx_impurity, False)
    print ratio(data, 'a', 'cls', measure.cls_err, cls_err_impurity, False)
    print

    print 'split'
    print interval(data, 'b', 'cls', measure.entropy, entropy_impurity)
    print interval(data, 'b', 'cls', measure.giniidx, giniidx_impurity)
    print interval(data, 'b', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print interval(data, 'b', 'cls', measure.entropy, entropy_impurity, False)
    print interval(data, 'b', 'cls', measure.giniidx, giniidx_impurity, False)
    print interval(data, 'b', 'cls', measure.cls_err, cls_err_impurity, False)
    print

    print 'split'
    print ordinal(data, 'c', 'cls', measure.entropy, entropy_impurity)
    print ordinal(data, 'c', 'cls', measure.giniidx, giniidx_impurity)
    print ordinal(data, 'c', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print ordinal(data, 'c', 'cls', measure.entropy, entropy_impurity, False)
    print ordinal(data, 'c', 'cls', measure.giniidx, giniidx_impurity, False)
    print ordinal(data, 'c', 'cls', measure.cls_err, cls_err_impurity, False)
    print

    print 'split'
    print nominal(data, 'c', 'cls', measure.entropy, entropy_impurity)
    print nominal(data, 'c', 'cls', measure.giniidx, giniidx_impurity)
    print nominal(data, 'c', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print nominal(data, 'c', 'cls', measure.entropy, entropy_impurity, False)
    print nominal(data, 'c', 'cls', measure.giniidx, giniidx_impurity, False)
    print nominal(data, 'c', 'cls', measure.cls_err, cls_err_impurity, False)
    print
# Example #2
# 0
# print all combination of k and sd
for k in xrange(2, 5):
    for sd_away in xrange(0, 5):
        try:
            kdist = dbscan.k_distance(dataset, k, sd_away)
        except:
            print 'kdist anchor out of range, skipped'
            print
            continue

        cluster = dbscan.dbscan(dataset, kdist, k)
        if len(cluster) == 0:
            print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(cluster)
            print
            continue

        cc = [measure.cls_err(c) for c in cluster]
        errs = []
        clss = []
        for err, cls in cc:
            errs.append(err)
            clss.append(cls)
        cp = [len(c) for c in cluster]

        print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(cluster)
        for i in xrange(0, len(cluster)):
            print 'cluster:', i, 'no. of pt. in cluster:', cp[i], 'impurity (classification error):', errs[i], 'majority:', clss[i]
        print 'mean impurity:', float(sum(errs)) / len(errs), 'sum of pt.:', sum(cp)
        print
# Example #3
# 0
    for sd_away in xrange(0, 5):
        try:
            kdist = dbscan.k_distance(dataset, k, sd_away)
        except:
            print 'kdist anchor out of range, skipped'
            print
            continue

        cluster = dbscan.dbscan(dataset, kdist, k)
        if len(cluster) == 0:
            print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(
                cluster)
            print
            continue

        cc = [measure.cls_err(c) for c in cluster]
        errs = []
        clss = []
        for err, cls in cc:
            errs.append(err)
            clss.append(cls)
        cp = [len(c) for c in cluster]

        print 'k:', k, 'sd:', sd_away, 'kdist:', kdist, 'no. of cluster:', len(
            cluster)
        for i in xrange(0, len(cluster)):
            print 'cluster:', i, 'no. of pt. in cluster:', cp[
                i], 'impurity (classification error):', errs[
                    i], 'majority:', clss[i]
        print 'mean impurity:', float(
            sum(errs)) / len(errs), 'sum of pt.:', sum(cp)
def __test__():
    import measure, datetime

    data = [
        {
            'a': 2,
            'b': datetime.date(2003, 12, 5),
            'c': 'A',
            'cls': 1
        },
        {
            'a': 7,
            'b': datetime.date(2004, 12, 5),
            'c': 'A',
            'cls': 1
        },
        {
            'a': 1,
            'b': datetime.date(2007, 12, 5),
            'c': 'A',
            'cls': 2
        },
        {
            'a': 9,
            'b': datetime.date(2008, 12, 5),
            'c': 'D',
            'cls': 3
        },
        {
            'a': 3,
            'b': datetime.date(2009, 12, 5),
            'c': 'B',
            'cls': 1
        },
        {
            'a': 2,
            'b': datetime.date(2010, 12, 5),
            'c': 'C',
            'cls': 3
        },
    ]

    entropy_impurity = measure.entropy(data, 'cls')
    giniidx_impurity = measure.giniidx(data, 'cls')
    cls_err_impurity = measure.cls_err(data, 'cls')

    print 'split'
    print ratio(data, 'a', 'cls', measure.entropy, entropy_impurity)
    print ratio(data, 'a', 'cls', measure.giniidx, giniidx_impurity)
    print ratio(data, 'a', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print ratio(data, 'a', 'cls', measure.entropy, entropy_impurity, False)
    print ratio(data, 'a', 'cls', measure.giniidx, giniidx_impurity, False)
    print ratio(data, 'a', 'cls', measure.cls_err, cls_err_impurity, False)
    print

    print 'split'
    print interval(data, 'b', 'cls', measure.entropy, entropy_impurity)
    print interval(data, 'b', 'cls', measure.giniidx, giniidx_impurity)
    print interval(data, 'b', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print interval(data, 'b', 'cls', measure.entropy, entropy_impurity, False)
    print interval(data, 'b', 'cls', measure.giniidx, giniidx_impurity, False)
    print interval(data, 'b', 'cls', measure.cls_err, cls_err_impurity, False)
    print

    print 'split'
    print ordinal(data, 'c', 'cls', measure.entropy, entropy_impurity)
    print ordinal(data, 'c', 'cls', measure.giniidx, giniidx_impurity)
    print ordinal(data, 'c', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print ordinal(data, 'c', 'cls', measure.entropy, entropy_impurity, False)
    print ordinal(data, 'c', 'cls', measure.giniidx, giniidx_impurity, False)
    print ordinal(data, 'c', 'cls', measure.cls_err, cls_err_impurity, False)
    print

    print 'split'
    print nominal(data, 'c', 'cls', measure.entropy, entropy_impurity)
    print nominal(data, 'c', 'cls', measure.giniidx, giniidx_impurity)
    print nominal(data, 'c', 'cls', measure.cls_err, cls_err_impurity)
    print 'nosplit'
    print nominal(data, 'c', 'cls', measure.entropy, entropy_impurity, False)
    print nominal(data, 'c', 'cls', measure.giniidx, giniidx_impurity, False)
    print nominal(data, 'c', 'cls', measure.cls_err, cls_err_impurity, False)
    print