def main():
    random.seed(1)
    args = parse_options()
    title = args.train.split("/")[-1].split(".")[0]
    arff = Arff(args.train)
    print "dataset=%s" % (title)
    dc = DataCollection(discretize(arff.data))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()
    for i in range(10):  # Hard-coded 10 x 3 cross-validation.
        k_fold = ic.k_fold_stratified_cross_val(3)
        for j in range(3):
            # Rotate the held-out fold; the remaining folds form the train set.
            test = k_fold[j]
            train = squash(k_fold[:j] + k_fold[j + 1:])
            trainXY = log_y(log_x(deepcopy(train)))
            testXY = log_y(log_x(deepcopy(test)))
            quadrants = QuadrantTree(trainXY).leaves()
            clusters = GRIDCLUS(quadrants, args.accept)
            clusters, culled_clusters = prune_clusters_classic(deepcopy(clusters), args.cull)
            # Step through the test instances one at a time.
            for instance in testXY:
                if instance_in_culled(instance, culled_clusters):
                    # Place in the closest cluster, with an effort score within
                    # 10% of its original.
                    closest_cluster = [sys.maxint, None, None]
                    for k in range(len(clusters)):
                        for quadrant in clusters[k].quadrants:
                            tmp_distance = distance(instance.Coord(), quadrant.center())
                            if tmp_distance < closest_cluster[0]:
                                closest_cluster[0] = tmp_distance
                                closest_cluster[1] = k
                                closest_cluster[2] = quadrant
                    # Guess from the oracle: perturb the class by up to +/-10%.
                    instance.datum[-1] = instance.klass() + instance.klass() * (
                        random.randint(0, 10) / 10.0) * random_element([-1, 1])
                    # Place in the closest cluster's closest quadrant.
                    closest_cluster[2].instances.append(instance)
                else:
                    closest_cluster = [sys.maxint, None]
                    for k in range(len(clusters)):
                        for quadrant in clusters[k].quadrants:
                            tmp_distance = distance(instance.Coord(), quadrant.center())
                            if tmp_distance < closest_cluster[0]:
                                closest_cluster[0] = tmp_distance
                                closest_cluster[1] = k
                got = median([inst.klass() for inst in clusters[closest_cluster[1]].instances()])
                want = instance.klass()
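# The scripts here call a distance(a, b) helper that is not shown. A minimal
# sketch, assuming plain Euclidean distance over coordinate tuples (an
# assumption for illustration, not necessarily the repo's version):
import math

def distance(a, b):
    # Square root of the summed squared per-axis differences.
    return math.sqrt(sum((p - q) ** 2 for p, q in zip(a, b)))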
def __init__(self, data, goal, b=0.2, bins=10):
    for i in range(20):
        data = shuffle(data)
    self.data = discretize(data, bins)
    self.goal = goal.lower()
    self.brsplit(b)
    self.bfreq = self.freqtable(self.best)
    self.rfreq = self.freqtable(self.rest)
    self.score()
def main(): arff = Arff("data/lucene2.4.arff") first_splits = [[] for i in range(5)] for datum in arff.data: if datum[arff.headers.index('#bug')] == 0.0: first_splits[0].append(datum) elif datum[arff.headers.index('#bug')] == 1.0: first_splits[1].append(datum) elif datum[arff.headers.index('#bug')] == 2.0: first_splits[2].append(datum) elif datum[arff.headers.index('#bug')] == 3.0: first_splits[3].append(datum) elif datum[arff.headers.index('#bug')] == 4.0: first_splits[4].append(datum) disc = discretize(arff.data) dc = DataCollection(disc) ic = InstanceCollection(dc) ic.normalize_coordinates() trainXY = log_y(log_x(deepcopy(ic.instances))) quadrants = QuadrantTree(trainXY).leaves() clusters = GRIDCLUS(quadrants) for cluster in clusters: print "C: ", len(cluster.datums()) clusters_orig = [] for cluster in clusters: l = [] for datum in cluster.datums(): l.append(arff.data[disc.index(datum)]) clusters_orig.append(l) print len(clusters_orig), len(first_splits) result = [[] for i in range(len(clusters_orig))] for i in range(len(clusters_orig)): for j in range(len(first_splits)): sumt = 0.0 for k in range(len(clusters_orig[i])): if clusters_orig[i][k] in first_splits[j]: sumt += 1.0 result[i].append(sumt / (len(clusters_orig[i]) + len(first_splits[j]))) for r in result: for c in r: print "%.2f" % c, ",", print "\n"
def __init__(self, headers, instances, class_type, goal):
    self.ops = {"<": operator.lt, ">": operator.gt}  # Operators we support for goal.
    self.headers = headers
    self.datums = discretize([inst.datum for inst in instances], 3)
    self.you = random_element(self.datums)
    self.class_type = class_type
    self.goal = goal
    self.seen_goal = {}
    self.seen_ngoal = {}
    self.buildSeen()
    self.scoreAttributes()
def __init__(self, data, headers, goal, b=0.2, bins=10):
    random.shuffle(data)
    self.data = discretize(data, bins)
    self.headers = headers
    self.goal = goal.lower()
    self.best = []
    self.rest = []
    self.brsplit(b)
    self.bfreq = self.freqtable(self.best)
    self.rfreq = self.freqtable(self.rest)
    self.score()
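# brsplit and freqtable are defined elsewhere in the repo. As a rough sketch
# of the intended split -- an assumption based on the b parameter, not the
# actual implementation -- brsplit likely ranks rows by the goal column and
# peels off the top b-fraction as "best":
def brsplit_sketch(data, b=0.2):
    ranked = sorted(data, key=lambda row: row[-1])  # ascending goal score
    cut = int(round(len(ranked) * b))
    return ranked[:cut], ranked[cut:]  # (best, rest)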
def main():
    args = parse_options()
    arff = Arff(args.train[0])
    arff.headers.remove("name")
    arff.data = remove_column(arff.data, 0)
    o_len = len(arff.data)
    print "Length of original data is:", len(arff.data)
    for i in range(6):
        print "Removed up to %d percent of the data." % (i * 10)
        if i != 0:
            arff.data = remove_x_percent(arff.data, 0.1, o_len)
        print "Current length of data:", len(arff.data)
        random.shuffle(arff.data)
        train = discretize(arff.data, 7)
        test = train[0:9]  # Hold out the first nine rows.
        d_rules = [[] for t in test]
        for x in range(args.xval):
            dc = DataCollection(train)
            ic = InstanceCollection(dc)
            ic.normalize_coordinates()
            testXY = []
            for datum in test:
                for instance in ic.instances:
                    if instance.datum == datum:
                        testXY.append(instance)
                        ic.instances.remove(instance)
                        break  # Stop: removing while iterating skips elements.
            testXY = log_x(log_y(deepcopy(testXY)))
            trainXY = log_x(log_y(deepcopy(ic.instances)))
            quadrants = QuadrantTree(trainXY).leaves()
            clusters = GRIDCLUS(quadrants)
            for instance in testXY:
                closest = [sys.maxint, None]
                for cluster in [c for c in clusters if len(c.datums()) > 20]:
                    for quadrant in cluster.quadrants:
                        tmp = distance(instance.Coord(), quadrant.center())
                        if tmp < closest[0]:
                            closest[0] = tmp
                            closest[1] = cluster
                rules = which2n(arff.headers, closest[1].datums())
                d_rules[testXY.index(instance)].append(rules[0])
        # Report how often each distinct rule was learned for each test row.
        for rule_list in d_rules:
            rule_strings = [rule.describe_short() for rule in rule_list]
            unique_rules = []
            for rule in rule_list:
                if rule.describe_short() not in unique_rules:
                    unique_rules.append(rule.describe_short())
            print d_rules.index(rule_list)
            for rule in unique_rules:
                count = float(rule_strings.count(rule))
                total = float(len(rule_strings))
                print rule, (count / total) * 100
            print ""
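# Minimal restatement of the report above (illustration only): the stability
# of each distinct rule string across the xval runs for one test row.
def rule_stability(rule_strings):
    total = float(len(rule_strings))
    return dict((r, rule_strings.count(r) / total * 100) for r in set(rule_strings))

print rule_stability(["loc<10", "loc<10", "cbo>5"])  # {'cbo>5': 33.3..., 'loc<10': 66.6...}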
def main():
    args = parse_options()
    arff = Arff(args.train[0])
    arff.headers.remove("name")
    arff.data = remove_column(arff.data, 0)
    """ Disabled experiment: inspect the east/west poles under reshuffling.
    for i in range(20):
        random.shuffle(arff.data)
        datar = arff.data
        dc = DataCollection(discretize(datar, 7))
        ic = InstanceCollection(dc)
        print "EAST: ", ic.east
        print "WEST: ", ic.west
        print ""
    """
    print arff.name
    random.shuffle(arff.data)
    train = discretize(arff.data, 4)
    test = train[0:9]  # Hold out the first nine rows.
    d_rules = [[] for t in test]
    for x in range(args.xval):
        dc = DataCollection(train)
        ic = InstanceCollection(dc)
        ic.normalize_coordinates()
        testXY = []
        for datum in test:
            for instance in ic.instances:
                if instance.datum == datum:
                    testXY.append(instance)
                    ic.instances.remove(instance)
                    break  # Stop: removing while iterating skips elements.
        testXY = log_x(log_y(deepcopy(testXY)))
        trainXY = log_x(log_y(deepcopy(ic.instances)))
        quadrants = QuadrantTree(trainXY).leaves()
        clusters = GRIDCLUS(quadrants)
        for instance in testXY:
            closest = [sys.maxint, None]
            for cluster in [c for c in clusters if len(c.datums()) > 20]:
                for quadrant in cluster.quadrants:
                    tmp = distance(instance.Coord(), quadrant.center())
                    if tmp < closest[0]:
                        closest[0] = tmp
                        closest[1] = cluster
            rules = which2n(arff.headers, closest[1].datums())
            d_rules[testXY.index(instance)].append(rules[0])
    # Report how often each distinct rule was learned for each test row.
    for rule_list in d_rules:
        rule_strings = [rule.describe_short() for rule in rule_list]
        unique_rules = []
        for rule in rule_list:
            if rule.describe_short() not in unique_rules:
                unique_rules.append(rule.describe_short())
        print d_rules.index(rule_list)
        for rule in unique_rules:
            count = float(rule_strings.count(rule))
            total = float(len(rule_strings))
            print rule, (count / total) * 100
        print ""
def main():
    random.seed(1)
    args = parse_options()
    title = args.train.split("/")[-1].split(".")[0]
    arff = Arff(args.train)
    print "dataset, %s" % (title)
    dc = DataCollection(discretize(arff.data))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()
    total_score_list = []
    total_train_list = []
    for x in range(args.xval):
        era_list = ic.k_fold_stratified_cross_val(int(len(arff.data) / args.era))
        # Transform every era; reassigning the loop variable would be a no-op.
        era_list = [log_y(log_x(deepcopy(era))) for era in era_list]
        score_list = []
        train_list = []
        train = deepcopy(era_list.pop(0))
        for i in range(len(era_list)):
            quadrants = QuadrantTree(train).leaves()
            clusters = GRIDCLUS(quadrants, args.accept)
            clusters, culled_clusters = prune_clusters_classic(deepcopy(clusters), args.cull)
            # culled_rules = Bore(squash([clus.datums() for clus in culled_clusters]),
            #                     arff.headers, "trueyes").top_rules(args.rules)
            if i + 1 < len(era_list):
                score = DefectStats()
                for instance in era_list[i + 1]:
                    closest_cluster = [sys.maxint, None]
                    for c in range(len(clusters)):  # 'c', not 'i': don't clobber the era index.
                        for quadrant in clusters[c].quadrants:
                            tmp_distance = distance(instance.Coord(), quadrant.center())
                            if tmp_distance < closest_cluster[0]:
                                closest_cluster[0] = tmp_distance
                                closest_cluster[1] = c
                    modified_train = []
                    for quadrant in clusters[closest_cluster[1]].quadrants:
                        modified_train.extend(quadrant.ClassCoords())
                    got = classify(instance.Coord(), modified_train, "DEFECT")
                    score.Evaluate(got, instance.klass())
                score_list.append(score.HarmonicMean("TRUE"))
                train_list.append(len(squash([clus.instances() for clus in clusters])))
                # Fold the clustered survivors plus the next era into the train set.
                train = []
                for cluster in clusters:
                    train.extend(cluster.instances())
                train.extend(era_list[i + 1])
        del era_list
        total_score_list.append(score_list)
        total_train_list.append(train_list)
    total_score_list = transpose(total_score_list)
    total_train_list = transpose(total_train_list)  # Both lists must be transposed.
    for i in range(len(total_score_list)):
        print median(total_score_list[i]), median(total_train_list[i])
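# Hedged aside: HarmonicMean("TRUE") on DefectStats is assumed to be the
# usual F-measure (harmonic mean of precision and recall) for that class.
def f1(precision, recall):
    if precision + recall == 0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)

print f1(0.8, 0.5)  # 0.6153...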
def keys2(data, headers):
    enough = 0.33
    too_small = 3
    bins = 5
    i = 0
    data = discretize(deepcopy(data), bins)
    datar = [deepcopy(data)]
    score = [median(transpose(data)[-1])]
    treatment = []
    while True:
        ranked = sorted(datar[i], key=lambda datum: datum[-1])
        cut = int(round(len(datar[i]) * enough))
        best = ranked[0:cut]
        if len(best) < too_small:
            break
        rest = ranked[cut:]  # Original sliced [cut+1:-1], silently dropping two rows.
        B = len(best)
        R = len(rest)
        wow = -1
        rx = None
        for a in headers[0:-2]:  # All independent attributes.
            col = headers.index(a)
            for v in set([datum[col] for datum in datar[i]]):
                b = float([datum[col] for datum in best].count(v))
                if b > 0:
                    r = float([datum[col] for datum in rest].count(v))
                    if (b / B) > (r / R):
                        tmp = ((b / B) ** 2) / ((b / B) + (r / R))
                        if tmp > wow:
                            wow = tmp
                            rx = (a, v)
        if rx is None:  # No candidate value beat the rest set; stop.
            break
        print "RX", rx
        i += 1
        datar.append([datum for datum in datar[i - 1]
                      if datum[headers.index(rx[0])] == rx[1]])
        if len(datar[i]) == 0:  # Check before scoring: median of [] would crash.
            break
        score.append(median(transpose(datar[i])[-1]))
        print score
        print treatment
        if len(datar[i]) == len(datar[i - 1]):
            break
        elif not score[i] < score[i - 1]:
            break
        else:
            treatment.insert(0, rx)
    return treatment
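# Toy illustration of the support-weighted score keys2 maximizes: with b the
# value's frequency in "best" and r its frequency in "rest" (both normalized),
# b^2 / (b + r) rewards values common in best and rare in rest.
def rank_score(b_count, B, r_count, R):
    b = float(b_count) / B
    r = float(r_count) / R
    return (b ** 2) / (b + r) if b > r else 0.0

print rank_score(8, 10, 1, 20)   # 0.752... -- dominates best
print rank_score(2, 10, 10, 20)  # 0.0     -- more common in rest, filtered out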
arffs = []
for f in files:
    arffs.append(Arff(f))
    # for of in files:
    #     if f is not of:
    #         arff = Arff([f, of])
    #         if arff not in arffs:
    #             arffs.append(arff)
# for i in range(len(files)):
#     arffs.append(Arff(files[0:i+1]))
arffs = list(set(arffs))
arffs = sorted(arffs, key=lambda a: len(a.data))
for a in arffs:
    # a.data = remove_column(a.data, 0)
    # a.headers.remove("dataset")
    # a.data = remove_column(a.data, 0)
    # a.headers.remove("name")
    # if not a.numsets > 1:
    #     a.data = remove_column(a.data, 0)
    #     a.headers.remove("version")
    print a.name, ",", len(a.data)
    dc = DataCollection(discretize(a.data, 6))
    ic = InstanceCollection(dc)
    if nomatch:
        r3.ands.append(r2and)
    r3.ands = sorted(r3.ands, key=lambda x: x.forr)
    return r3

def diffAngle(a, b):
    # Clamp cosine into [-1, 1] before acos; floating error can push it out.
    k = similarity(a, b)
    if k > 1:
        k = 1.0
    elif k < -1:
        k = -1.0
    return radians_to_degrees(math.acos(k))

def similarity(a, b):
    # Cosine similarity of two vectors.
    return dotProduct(a, b) / (magnitude(a) * magnitude(b))

def dotProduct(a, b):
    # Recursive dot product; an empty (or None) vector contributes 0.
    if not a:
        return 0
    elif len(a) == 1:
        return a[0] * b[0]
    else:
        return (a[0] * b[0]) + dotProduct(a[1:], b[1:])

def radians_to_degrees(theta):
    return theta * (180 / math.pi)

if __name__ == "__main__":
    arff = Arff("data/china.arff")
    rules = which2n(arff.headers, discretize(arff.data))
    for rule in rules:
        print rule.describe()
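# Quick check of the cosine-angle helpers above. magnitude is defined
# elsewhere in the repo; a minimal stand-in (assumed Euclidean norm) is used
# here so the example runs on its own.
import math

def magnitude(v):
    return math.sqrt(sum(x * x for x in v))

print similarity([1.0, 0.0], [1.0, 1.0])  # ~0.7071
print diffAngle([1.0, 0.0], [1.0, 1.0])   # ~45.0 degrees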
    # d += (float(len(cq.datums())) / float(len(c.datums()))) * distance(cq.center(), oq.center())
    return d

def most_feared(cluster, other_clusters):
    score = 0.0
    feared = None
    for other_cluster in [o for o in other_clusters if len(o.datums()) > 20]:
        n = 2.0
        dist_i = gaps(cluster, other_cluster) / max([gaps(cluster, o) for o in other_clusters])
        # sup_i = (len(cluster.datums()) - len(other_cluster.datums())) / max([len(o.datums()) for o in other_clusters])
        sup_i = float(len(other_cluster.datums())) / max([len(o.datums()) for o in other_clusters])
        sco_i = other_cluster.cmedian() / max([o.cmedian() for o in other_clusters])
        cscore = (sco_i * sup_i) ** n / dist_i
        if cscore > score:
            score = cscore
            feared = other_cluster  # Was 'Feared' -- a silent typo.
    return feared  # Was 'other_cluster', which returned the last one scanned.

if __name__ == "__main__":
    arff = Arff("data/china.arff")
    dc = DataCollection(discretize(arff.data, 7))
    ic = InstanceCollection(dc)
    ic.normalize_coordinates()
    trainXY = log_y(log_x(ic.instances))
    quadrants = QuadrantTree(trainXY).leaves()
    clusters = GRIDCLUS(quadrants)
    print gaps(clusters[0], clusters[-1])
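# Toy check of the cscore weighting used above: with n = 2, a large,
# high-median cluster that sits close by dominates a small or distant one.
def cscore(sco_i, sup_i, dist_i, n=2.0):
    return (sco_i * sup_i) ** n / dist_i

print cscore(1.0, 1.0, 0.5)  # 2.0    -- near, large, high median
print cscore(0.5, 0.3, 1.0)  # 0.0225 -- far, small, low median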
def main(): arff = Arff("data/china.arff") first_splits = [[] for i in range(4)] for datum in arff.data: if datum[arff.headers.index('Resource')] == 1.0: first_splits[0].append(datum) elif datum[arff.headers.index('Resource')] == 2.0: first_splits[1].append(datum) elif datum[arff.headers.index('Resource')] == 3.0: first_splits[2].append(datum) elif datum[arff.headers.index('Resource')] == 4.0: first_splits[3].append(datum) second_splits = [] for split in first_splits: md = median([datum[arff.headers.index('Duration')] for datum in arff.data]) print first_splits.index(split), md one = [] two = [] for datum in split: if datum[arff.headers.index('Duration')] < md: one.append(datum) else: two.append(datum) print len(one) print len(two) second_splits.append(one) second_splits.append(two) disc = discretize(arff.data) dc = DataCollection(disc) ic = InstanceCollection(dc) ic.normalize_coordinates() trainXY = log_y(log_x(deepcopy(ic.instances))) quadrants = QuadrantTree(trainXY).leaves() clusters = GRIDCLUS(quadrants) for cluster in clusters: print "C: ", len(cluster.datums()) clusters_orig = [] for cluster in clusters: l = [] for datum in cluster.datums(): l.append(arff.data[disc.index(datum)]) clusters_orig.append(l) print len(clusters_orig), len(second_splits) result = [[] for i in range(len(clusters_orig))] for i in range(len(clusters_orig)): for j in range(len(second_splits)): sumt = 0.0 for k in range(len(clusters_orig[i])): if clusters_orig[i][k] in second_splits[j]: sumt += 1.0 result[i].append(sumt / (len(clusters_orig[i]) + len(second_splits[j]))) for r in result: for c in r: print "%.2f" % c, ",", print "\n"