Example #1
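# 10 x 3 stratified cross-validation: grow a QuadrantTree over log-scaled training
# instances, cluster its leaves with GRIDCLUS, prune weak clusters, then route each
# test instance to its closest surviving cluster (Arff, GRIDCLUS, squash, etc. are
# project helpers).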
def main():
    random.seed(1)

    args = parse_options()

    title = args.train.split("/")[-1].split(".")[0]
    arff = Arff(args.train)

    print "dataset=%s" % (title)

    dc = DataCollection(discretize(arff.data))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()

    for i in range(10):  # Hard coded 10 x 3
        k_fold = ic.k_fold_stratified_cross_val(3)

        for j in range(3):
            test = k_fold[j]  # Hold out fold j
            train = squash(k_fold[:j] + k_fold[j + 1:])  # Train on the other folds

            trainXY = log_y(log_x(deepcopy(train)))
            testXY = log_y(log_x(deepcopy(test)))

            quadrants = QuadrantTree(trainXY).leaves()
            clusters = GRIDCLUS(quadrants, args.accept)

            clusters, culled_clusters = prune_clusters_classic(deepcopy(clusters), args.cull)

            # Step through instances one at a time
            for instance in testXY:
                if instance_in_culled(instance, culled_clusters):
                    # Place in closest cluster with an effort score within 10% of its original
                    closest_cluster = [sys.maxint, None, None]
                    for k in range(len(clusters)):
                        for quadrant in clusters[k].quadrants:
                            tmp_distance = distance(instance.Coord(), quadrant.center())
                            if tmp_distance < closest_cluster[0]:
                                closest_cluster[0] = tmp_distance
                                closest_cluster[1] = k
                                closest_cluster[2] = quadrant
                    # Guess from oracle: perturb the true class by a random signed fraction
                    instance.datum[-1] = instance.klass() + instance.klass() * (
                        random.randint(0, 10) / 10.0
                    ) * random_element([-1, 1])
                    # Place the instance in the closest cluster's closest quadrant
                    closest_cluster[2].instances.append(instance)
                else:
                    closest_cluster = [sys.maxint, None]
                    for k in range(len(clusters)):
                        for quadrant in clusters[k].quadrants:
                            tmp_distance = distance(instance.Coord(), quadrant.center())
                            if tmp_distance < closest_cluster[0]:
                                closest_cluster[0] = tmp_distance
                                closest_cluster[1] = k

                    got = median([inst.klass() for inst in clusters[closest_cluster[1]].instances()])
                    want = instance.klass()
Example #2
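    # Bore-style scorer: shuffle 20 times, discretize into bins, split the data into
    # best/rest parts, build a frequency table for each, then score (brsplit,
    # freqtable, and score are class helpers).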
    def __init__(self, data, goal, b=0.2, bins=10):
        for i in range(20):
            data = shuffle(data)
        self.data = discretize(data, bins)
        self.goal = goal.lower()
        self.brsplit(b)
        self.bfreq = self.freqtable(self.best)
        self.rfreq = self.freqtable(self.rest)
        self.score()
Example #3
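# Cluster lucene2.4 with GRIDCLUS, then print how much each cluster overlaps a
# manual split of the data on its '#bug' column.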
def main():
    arff = Arff("data/lucene2.4.arff")
    first_splits = [[] for i in range(5)]

    bug_col = arff.headers.index('#bug')
    for datum in arff.data:
        if datum[bug_col] in (0.0, 1.0, 2.0, 3.0, 4.0):
            first_splits[int(datum[bug_col])].append(datum)

    disc = discretize(arff.data)
    dc = DataCollection(disc)
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()

    trainXY = log_y(log_x(deepcopy(ic.instances)))
    quadrants = QuadrantTree(trainXY).leaves()
    clusters = GRIDCLUS(quadrants)

    for cluster in clusters:
        print "C: ", len(cluster.datums())

    clusters_orig = []
    for cluster in clusters:
        l = []
        for datum in cluster.datums():
            l.append(arff.data[disc.index(datum)])
        clusters_orig.append(l)

    print len(clusters_orig), len(first_splits)

    result = [[] for i in range(len(clusters_orig))]

    for i in range(len(clusters_orig)):
        for j in range(len(first_splits)):
            sumt = 0.0
            for k in range(len(clusters_orig[i])):
                if clusters_orig[i][k] in first_splits[j]:
                    sumt += 1.0
            result[i].append(sumt /
                             (len(clusters_orig[i]) + len(first_splits[j])))

    for r in result:
        for c in r:
            print "%.2f" % c, ",",
        print "\n"
Example #4
File: nomo.py Project: abutcher/bi
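    # Nomogram setup: discretize the instance data, pick a random "you" datum,
    # then build the seen-goal/not-goal tables and score the attributes.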
    def __init__(self, headers, instances, class_type, goal):

        self.ops = {"<": operator.lt, ">": operator.gt} # Operators we support for goal.

        self.headers = headers
        self.datums = discretize([inst.datum for inst in instances], 3)
        self.you = random_element(self.datums)
        self.class_type = class_type
        self.goal = goal
        self.seen_goal = {}
        self.seen_ngoal = {}
        self.buildSeen()
        self.scoreAttributes()
Example #6
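    # Same Bore-style initializer as Example #2, with the best/rest lists created
    # explicitly before the split.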
    def __init__(self, data, headers, goal, b=0.2, bins=10):
        random.shuffle(data, random.random)
        self.data = discretize(data, bins)
        self.headers = headers
        self.goal = goal.lower()
        self.best = []
        self.rest = []
        self.brsplit(b)
        self.bfreq = self.freqtable(self.best)
        self.rfreq = self.freqtable(self.rest)
        self.score()
Example #7
File: nomo.py Project: timm/bi
    def __init__(self, headers, instances, class_type, goal):

        self.ops = {
            "<": operator.lt,
            ">": operator.gt
        }  # Operators we support for goal.

        self.headers = headers
        self.datums = discretize([inst.datum for inst in instances], 3)
        self.you = random_element(self.datums)
        self.class_type = class_type
        self.goal = goal
        self.seen_goal = {}
        self.seen_ngoal = {}
        self.buildSeen()
        self.scoreAttributes()
Example #8
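# Robustness experiment: drop another 10% of the data on each pass, recluster with
# GRIDCLUS, and report how often which2n produces each rule for nine held-out rows.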
def main():
    args = parse_options()
    arff = Arff(args.train[0])

    arff.headers.remove("name")
    arff.data = remove_column(arff.data, 0)
    o_len = len(arff.data)

    print "Length of original data is:", len(arff.data)

    for i in range(6):

        print "Removed up to %.2f percent of the data." % (i * 0.1)

        if i != 0:
            arff.data = remove_x_percent(arff.data, 0.1, o_len)
        print "Current length of data:", len(arff.data)

        random.shuffle(arff.data, random.random)
        train = discretize(arff.data, 7)
        test = train[0:9]

        d_rules = [[] for t in test]

        for j in range(args.xval):  # 'j' avoids shadowing the outer removal loop's 'i'
            dc = DataCollection(train)
            ic = InstanceCollection(dc)
            ic.normalize_coordinates()

            testXY = []
            for datum in test:
                for instance in ic.instances:
                    if instance.datum == datum:
                        testXY.append(instance)
                        ic.instances.remove(instance)
                        break  # don't keep iterating after mutating the list

            testXY = log_x(log_y(deepcopy(testXY)))
            trainXY = log_x(log_y(deepcopy(ic.instances)))

            quadrants = QuadrantTree(trainXY).leaves()
            clusters = GRIDCLUS(quadrants)

            for instance in testXY:
                Closest = [sys.maxint, None]
                for cluster in [
                        cluster for cluster in clusters
                        if len(cluster.datums()) > 20
                ]:
                    for quadrant in cluster.quadrants:
                        if distance(instance.Coord(),
                                    quadrant.center()) < Closest[0]:
                            Closest[0] = distance(instance.Coord(),
                                                  quadrant.center())
                            Closest[1] = cluster
                rules = which2n(arff.headers, Closest[1].datums())
                d_rules[testXY.index(instance)].append(rules[0])

        for rule_list in d_rules:
            unique_rules = []
            rule_strings = [rule.describe_short() for rule in rule_list]
            for rule in rule_list:
                if rule.describe_short() not in unique_rules:
                    unique_rules.append(rule.describe_short())
            print d_rules.index(rule_list)
            for rule in unique_rules:
                count = float(rule_strings.count(rule))
                total = float(len(rule_strings))  # avoid shadowing the builtin all()
                print rule, (count / total) * 100
            print ""
Example #9
File: gaps.py Project: timm/bi
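# most_feared() ranks rival clusters by (median score * support)^2 over gap
# distance and keeps the top scorer.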
def most_feared(cluster, other_clusters):
    score = 0.0
    feared = None
    #print "Starting"
    for other_cluster in [o for o in other_clusters if len(o.datums()) > 20]:
        n = 2.0
        dist_i = (gaps(cluster, other_cluster) /
                  max([gaps(cluster, o) for o in other_clusters]))
        #sup_i = (len(cluster.datums())-len(other_cluster.datums()))/max([len(o.datums()) for o in other_clusters])
        sup_i = (len(other_cluster.datums())) / max(
            [len(o.datums()) for o in other_clusters])
        sco_i = (((other_cluster.cmedian() /
                   max([o.cmedian() for o in other_clusters]))))

        cscore = (sco_i * sup_i)**n / dist_i
        if cscore > score:
            score = cscore
            feared = other_cluster
        #print "cscore selected", cscore
    return feared


if __name__ == "__main__":
    arff = Arff("data/china.arff")
    dc = DataCollection(discretize(arff.data, 7))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()
    trainXY = log_y(log_x(ic.instances))
    quadrants = QuadrantTree(trainXY).leaves()
    clusters = GRIDCLUS(quadrants)
    print gaps(clusters[0], clusters[-1])
Example #10
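# Rule-stability run: over args.xval rounds, cluster with GRIDCLUS, find each test
# row's closest big cluster, and tally which which2n rules recur.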
def main():
    args = parse_options()
    arff = Arff(args.train[0])

    arff.headers.remove("name")
    arff.data = remove_column(arff.data, 0)
    """
    for i in range(20):
    random.shuffle(arff.data, random.random)
    datar = arff.data
    dc = DataCollection(discretize(datar, 7))
    ic = InstanceCollection(dc)
    print "EAST: ", ic.east
    print "WEST: ", ic.west
    print ""    
    """

    print arff.name

    random.shuffle(arff.data, random.random)
    train = discretize(arff.data, 4)
    test = train[0:9]

    d_rules = [[] for t in test]

    for i in range(args.xval):
        dc = DataCollection(train)
        ic = InstanceCollection(dc)
        ic.normalize_coordinates()

        testXY = []
        for datum in test:
            for instance in ic.instances:
                if instance.datum == datum:
                    testXY.append(instance)
                    ic.instances.remove(instance)
                    break  # don't keep iterating after mutating the list

        testXY = log_x(log_y(deepcopy(testXY)))
        trainXY = log_x(log_y(deepcopy(ic.instances)))

        quadrants = QuadrantTree(trainXY).leaves()
        clusters = GRIDCLUS(quadrants)

        for instance in testXY:
            Closest = [sys.maxint, None]
            for cluster in [cluster for cluster in clusters if len(cluster.datums()) > 20]:
                for quadrant in cluster.quadrants:
                    if distance(instance.Coord(), quadrant.center()) < Closest[0]:
                        Closest[0] = distance(instance.Coord(), quadrant.center())
                        Closest[1] = cluster
            rules = which2n(arff.headers, Closest[1].datums())
            d_rules[testXY.index(instance)].append(rules[0])

    for rule_list in d_rules:
        unique_rules = []
        rule_strings = [rule.describe_short() for rule in rule_list]
        for rule in rule_list:
            if rule.describe_short() not in unique_rules:
                unique_rules.append(rule.describe_short())
        print d_rules.index(rule_list)
        for rule in unique_rules:
            count = float(rule_strings.count(rule))
            total = float(len(rule_strings))  # avoid shadowing the builtin all()
            print rule, (count / total) * 100
        print ""
Example #11
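# keys2: iterative treatment learner. Each pass keeps only the rows matching the
# (attribute, value) pair that best separates the best 33% from the rest, stopping
# when the median score stops improving.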
def keys2(data, headers):
    enough = 0.33
    too_small = 3
    bins = 5

    i = 0
    data = discretize(deepcopy(data), bins)
    datar = []
    datar.append(deepcopy(data))
    score = []
    score.append(median(transpose(data)[-1]))
    treatment = []
    worth_something = []
    
    while True:
        best = sorted(datar[i], key=lambda datum: datum[-1])[0:int(round(len(datar[i])*enough))]

        if len(best) < too_small:
            #print "breaking 0"
            break

        rest = sorted(datar[i], key=lambda datum: datum[-1])[int(round(len(datar[i])*enough))+1:-1]

        B = len(best)
        R = len(rest)
        #print "B", B
        #print "R", R
        wow = -1
        rx = None

        for a in headers[0:-2]: # all independent attributes
            for v in [datum[headers.index(a)] for datum in datar[i]]:
                b = float([datum[headers.index(a)] for datum in best].count(v))
                #print "b", b
                if b > 0:
                    r = float([datum[headers.index(a)] for datum in rest].count(v))
                    #print "r", r
                    if (b/B) > (r/R):
                        tmp = ((b/B)**2)/((b/B) + (r/R))
                        #print "tmp", tmp
                        #worth_something.append(rx)
                        if tmp > wow:
                            wow = tmp
                            rx = (a,v)
                            #print "setting rx"
                            print "RX", rx

        if rx is None:  # no (attribute, value) pair separated best from rest
            break

        i += 1
        datar.append([datum for datum in datar[i-1] if datum[headers.index(rx[0])] == rx[1]])
        #print len(datar[i])
        score.append(median(transpose(datar[i])[-1]))
        print score
        print treatment
        if len(datar[i]) == 0:
            #print "breaking 1"
            break
        elif len(datar[i]) == len(datar[i-1]):
            #print "breaking 2"
            break
        elif not score[i] < score[i-1]:
            #print "breaking 3"
            break
        else:
            print "setting awesome treatment"
            treatment.insert(0, rx)
        print "looping"
    #print worth_something
    return treatment
Example #12
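# Era-based defect learner: split the data into eras, cluster the current training
# pool with GRIDCLUS, prune, classify the next era's instances against their
# closest cluster, then fold that era into the pool.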
def main():
    random.seed(1)
    
    args = parse_options()
    
    title = args.train.split("/")[-1].split(".")[0]
    arff = Arff(args.train)

    print "dataset, %s" % (title)

    dc = DataCollection(discretize(arff.data))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()

    total_score_list = []
    total_train_list = []
    for i in range(args.xval):
        
        era_list = ic.k_fold_stratified_cross_val(int(len(arff.data)/args.era))
        # Reassigning the loop variable would discard the transform; rebuild the list
        era_list = [log_y(log_x(deepcopy(era))) for era in era_list]
        score_list = []
        train_list = []

        train = deepcopy(era_list[0])
        era_list.remove(era_list[0])
    
        for i in range(len(era_list)):
            quadrants = QuadrantTree(train).leaves()
            clusters = GRIDCLUS(quadrants, args.accept)
            
            clusters, culled_clusters = prune_clusters_classic(deepcopy(clusters), args.cull)
            
#            culled_rules = Bore(squash([clus.datums() for clus in culled_clusters]), arff.headers, "trueyes").top_rules(args.rules)

            if i+1 < len(era_list):
                score = DefectStats()
                for instance in era_list[i+1]:
                    closest_cluster = [sys.maxint, None]
                    for k in range(len(clusters)):  # 'k' avoids clobbering the era index 'i'
                        for quadrant in clusters[k].quadrants:
                            tmp_distance = distance(instance.Coord(), quadrant.center())
                            if tmp_distance < closest_cluster[0]:
                                closest_cluster[0] = tmp_distance
                                closest_cluster[1] = k

                    # Classify once, against the closest cluster only
                    modified_train = []
                    for quadrant in clusters[closest_cluster[1]].quadrants:
                        modified_train.extend(quadrant.ClassCoords())

                    got = classify(instance.Coord(), modified_train, "DEFECT")
                    score.Evaluate(got, instance.klass())
                score_list.append(score.HarmonicMean("TRUE"))
                train_list.append(len(squash([clus.instances() for clus in clusters])))

            if i+1 < len(era_list):
                train = []
                for cluster in clusters:
                    train.extend(cluster.instances())
                train.extend(era_list[i+1])

        del era_list
        total_score_list.append(score_list)
        total_train_list.append(train_list)

    total_score_list = transpose(total_score_list)
    for i in range(len(total_score_list)):
        print median(total_score_list[i]), median(total_train_list[i])                                  
Example #14
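# Load each ARFF file, dedup and sort by size, then print every dataset's name and
# row count before clustering it.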
arffs = []

for f in files:
    arffs.append(Arff(f))
    #for of in files:
    #    if f is not of:
    #        arff = Arff([f, of])
    #        if arff not in arffs:
    #            arffs.append(arff)

#for i in range(len(files)):
#    arffs.append(Arff(files[0:i+1]))

arffs = list(set(arffs))
arffs = sorted(arffs, key=lambda a: len(a.data))

for a in arffs:
    #    a.data = remove_column(a.data, 0)
    #    a.headers.remove("dataset")

    #    a.data = remove_column(a.data, 0)
    #    a.headers.remove("name")

    #    if not a.numsets > 1:
    #        a.data = remove_column(a.data, 0)
    #        a.headers.remove("version")
    print a.name, ",", len(a.data)

    dc = DataCollection(discretize(a.data, 6))
    ic = InstanceCollection(dc)
Example #17
File: which2.py Project: timm/bi
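# Tail of a which2 rule merge, plus small vector helpers: similarity() is cosine
# similarity and diffAngle() converts it to degrees.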
        if nomatch:
            r3.ands.append(r2and)
    r3.ands = sorted(r3.ands, key=lambda x: x.forr)
    return r3

def diffAngle(a, b):
    k = similarity(a, b)
    # Clamp to acos's domain; float error can push |k| past 1.0
    if k > 1:
        k = 1.0
    elif k < -1:
        k = -1.0
    return radians_to_degrees(math.acos(k))

def similarity(a, b):
    return dotProduct(a, b) / (magnitude(a) * magnitude(b))

def dotProduct(a, b):
    if a is None:
        return 0
    elif len(a) == 1:
        return a[0] * b[0]
    else:
        return (a[0] * b[0]) + dotProduct(a[1:], b[1:])

def radians_to_degrees(theta):
    return theta * (180 / math.pi)

if __name__ == "__main__":
    arff = Arff("data/china.arff")
    rules = which2n(arff.headers, discretize(arff.data))
    for rule in rules:
        print rule.describe()
Example #18
File: gaps.py Project: abutcher/bi
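# Tail of gaps() plus most_feared(), the same rival-cluster scorer as Example #9:
# (median score * support)^2 over gap distance, keeping the top scorer.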
            #d += ( float(len(cq.datums())) / float(len(c.datums())) )* distance(cq.center(), oq.center())
    return d

def most_feared(cluster, other_clusters):
    score = 0.0
    feared = None
    #print "Starting"
    for other_cluster in [o for o in other_clusters if len(o.datums()) > 20]:
        n = 2.0
        dist_i = (gaps(cluster, other_cluster)/max([gaps(cluster, o) for o in other_clusters]))
        #sup_i = (len(cluster.datums())-len(other_cluster.datums()))/max([len(o.datums()) for o in other_clusters])
        sup_i = (len(other_cluster.datums()))/max([len(o.datums()) for o in other_clusters])
        sco_i = (((other_cluster.cmedian()/max([o.cmedian() for o in other_clusters]))))
                 
        cscore = (sco_i*sup_i)**n/dist_i
        if cscore > score:
            score = cscore
            feared = other_cluster
        #print "cscore selected", cscore
    return feared

if __name__=="__main__":
    arff = Arff("data/china.arff")
    dc = DataCollection(discretize(arff.data, 7))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()
    trainXY = log_y(log_x(ic.instances))
    quadrants = QuadrantTree(trainXY).leaves()
    clusters = GRIDCLUS(quadrants)
    print gaps(clusters[0], clusters[-1])
Example #19
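# Cluster china.arff with GRIDCLUS, then print each cluster's overlap with a
# hand-made 8-way split: four 'Resource' buckets, each halved at the dataset's
# median 'Duration'.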
def main():
    arff = Arff("data/china.arff")
    first_splits = [[] for i in range(4)]

    res_col = arff.headers.index('Resource')
    for datum in arff.data:
        if datum[res_col] in (1.0, 2.0, 3.0, 4.0):
            first_splits[int(datum[res_col]) - 1].append(datum)

    second_splits = []

    for split in first_splits:
        md = median([datum[arff.headers.index('Duration')] for datum in arff.data])
        print first_splits.index(split), md
        one = []
        two = []
        for datum in split:
            if datum[arff.headers.index('Duration')] < md:
                one.append(datum)
            else:
                two.append(datum)
        print len(one)
        print len(two)
        second_splits.append(one)
        second_splits.append(two)

    disc = discretize(arff.data)
    dc = DataCollection(disc)
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()

    trainXY = log_y(log_x(deepcopy(ic.instances)))
    quadrants = QuadrantTree(trainXY).leaves()
    clusters = GRIDCLUS(quadrants)

    for cluster in clusters:
        print "C: ", len(cluster.datums())

    clusters_orig = []
    for cluster in clusters:
        l = []
        for datum in cluster.datums():
            l.append(arff.data[disc.index(datum)])
        clusters_orig.append(l)
        
    print len(clusters_orig), len(second_splits)
        
    result = [[] for i in range(len(clusters_orig))]
        
    for i in range(len(clusters_orig)):
        for j in range(len(second_splits)):
            sumt = 0.0
            for k in range(len(clusters_orig[i])):
                if clusters_orig[i][k] in second_splits[j]:
                    sumt += 1.0
            result[i].append(sumt / (len(clusters_orig[i]) + len(second_splits[j])))

    for r in result:
        for c in r:
            print "%.2f" % c, ",",
        print "\n"