def analyse(path, filter_fn, field_name, print_csv=False):
    """Print occurrence statistics for the data loaded from *path*.

    The data comes from ``load_data(path, filter_fn)`` and is read here as a
    mapping with three entries:

    * ``'days'`` -- the occurrences; indexable, and subtracting two of them
      must yield an object with a ``.days`` attribute.  The first and last
      elements are used as the start and end of the observed period
      (presumably the sequence is sorted chronologically -- confirm against
      ``load_data``).
    * ``'day_of_cycle'`` -- mapping of day-of-cycle key to a count.
    * ``'weekdays'`` -- mapping of weekday key to a count.

    Args:
        path: Passed straight through to ``load_data``.
        filter_fn: Passed straight through to ``load_data``.
        field_name: Label used in the CSV header and in the report text.
        print_csv: When true, print one ``<day>,<0|1>`` row per value yielded
            by ``date_range(first, last)`` instead of the report.

    Returns:
        None.  All output is printed to stdout.
    """
    data = load_data(path, filter_fn)
    occurrences = data['days']
    day_of_cycle = data['day_of_cycle']
    weekdays = data['weekdays']

    if len(occurrences) == 0:
        # Previously interpolated an undefined name (``tag``), which raised
        # NameError instead of showing this message.
        print("No tags found. Are you sure '%s' is the correct tag?" % field_name)
        return

    first = occurrences[0]
    last = occurrences[-1]

    if print_csv:
        print("date,%s" % field_name)
        # Build the lookup once; testing against the list is a linear scan
        # for every row of the CSV.
        occurrence_set = set(occurrences)
        for day in date_range(first, last):
            print("%s,%d" % (day, 1 if day in occurrence_set else 0))
        return

    # Gaps between consecutive occurrences.  Gaps of two days or fewer are
    # left out of the statistics (presumably they belong to the same
    # episode -- confirm the intended threshold).
    deltas = []
    for earlier, later in zip(occurrences, occurrences[1:]):
        gap_days = (later - earlier).days
        if gap_days > 2:
            deltas.append(gap_days)

    separator = "==============="

    print(separator)
    print("Day of cycle distribution")
    day_of_cycle_total = float(sum(day_of_cycle.values()))
    previous = None
    for key in sorted(day_of_cycle):
        # Compare against None explicitly: a key of 0 is a valid previous
        # value and must not suppress the gap marker.
        if previous is not None and key - previous > 1:
            print(".\n.")
        previous = key
        count = day_of_cycle[key]
        print("%s %s %s" % (
            ("Day %s:" % key).ljust(10),
            str(count).ljust(4),
            round(count / day_of_cycle_total, 2),
        ))

    print(separator)
    print("Weekday distribution")
    for key in sorted(weekdays):
        print("%s %s" % (weekday_from_int(key).ljust(5), weekdays[key]))

    # The double space after each colon reproduces the historical output
    # (label ending in ": " followed by the print statement's separator).
    print(separator)
    print("Total amount of days with %s:  %s" % (field_name, len(occurrences)))
    print("Average amount of days between %s:  %s" % (field_name, average(deltas)))
    print("Std dev:  %s" % (std_dev(deltas),))
    print("Last day with %s:  %s" % (field_name, last))
    days_since_last = (datetime.datetime.today().date() - last.date()).days
    print("Days between today and last day with %s:  %s" % (field_name, days_since_last))
    print(separator)
if __name__ == "__main__":
    # Command line: <data file> <number of trees per forest>.
    # Runs a 5-fold cross-validation of the random forest and prints the
    # accuracy of every fold followed by the standard deviation across folds.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <data-file> <num-trees>" % sys.argv[0])
    filename = sys.argv[1]
    num_trees = int(sys.argv[2])
    data, attributes, target_attr = get_data(filename)

    num_folds = 5
    n = len(data)
    accs = []
    for i in range(num_folds):
        # Fold i covers the index range [start, stop).
        start = int(float(n) / num_folds * i)
        stop = int(float(n) / num_folds * (i + 1))

        valid_data = data[start:stop]  # validation data
        if len(valid_data) == 0:
            # Fewer rows than folds: nothing to evaluate in this fold.
            continue
        # Split by position rather than by value: a value-based exclusion is
        # quadratic and also drops training rows that merely equal a
        # validation row.
        train_data = [row for idx, row in enumerate(data)
                      if idx < start or idx >= stop]  # training data
        labels = [row[target_attr] for row in valid_data]

        # Train on the training split only.  Building the forest from the
        # full data set lets it see the validation rows and inflates the
        # reported accuracy.
        trees = create_forest(train_data, attributes, target_attr, num_trees)

        # classify
        # NOTE(review): vote(labels) is derived from the validation labels.
        # If the third argument of classify_decision_tree is a fallback
        # class, this leaks ground truth into the prediction -- confirm and
        # consider using the training labels instead.
        classes = [classify_decision_tree(tree, valid_data, vote(labels))
                   for tree in trees]
        # One vote per validation row across all trees.
        classification = [vote(c) for c in zip(*classes)]
        if not classification:
            # No trees produced predictions; avoid dividing by zero.
            continue

        count = sum(1 for predicted, actual in zip(classification, labels)
                    if predicted == actual)
        acc = float(count) / len(classification)
        accs.append(acc)
        print("accuracy: " + str(100 * acc) + "%")

    print("standard deviation: " + str(std_dev(accs)))