Esempio n. 1
0
def est_all_data(frequent_items, total_transactions):
    print 'est all data2'
    start = time()
    transactions = None
    #transactions = parser.parse_csv_to_mat('/Users/ahkj/Dropbox/SAAS/data/csv/sample-big/customers.txt')
    all_frequent_items = fpgrowth(transactions, supp=-10, min=1, max=3) #-10 yields 3437
    M, triples = filter_items(all_frequent_items)
    fp_time = time() - start
    print "Finding frequent items: {}".format(fp_time)

    est_start = time()

    est = []
    obs = []
    abs_errors = []
    max_est = 0
    max_obs = 0
    i = 0
    j = 0

    triangle_start = time()
    triangle_tree, triples = Forward.forward_compact(frequent_items)
    print 'Finding triangles done: ', (time()-triangle_start)

    # DFS the triangle tree
    for n1 in triangle_tree.keys():
        s1, s2_dict = triangle_tree[n1]
        for n2 in s2_dict.keys():
            s2, s12, s3_dict = s2_dict[n2]
            for n3 in s3_dict.keys():
                s3, s13, s23, s123 = s3_dict[n3]
                if s123 < 30:
                    continue

                e = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=20)

                est.append(e)
                obs.append(s123)
                error = abs(e-s123) / float(s123) * 100
                    
                abs_errors.append(error)

                # For plotting
                max_est = max(max_est, e)
                max_obs = max(max_obs, s123)

    with open('../tmp/est_all_data.json', 'w') as fd:
        fd.write(json.dumps(zip(est, obs)))
    with open('../tmp/est_al_data.tsv', 'w') as fd:
        fd.write('est\tobs\tkind\n')
        for index, i in enumerate(est):
            fd.write(str(est[index]) + '\t' + str(obs[index]) + '\t' + 'est/obs\n')
    # scale = 1.5

    # fig = plt.figure()
    # fig.text(0, 0, "Total running time: {} sec.".format(time()-est_start))
    avg_error = sum(abs_errors) / float(len(abs_errors))
    print 'avg error: {}'.format(avg_error)
    print 'error var: {}'.format(np.var(abs_errors))
    print 'max observed: {}'.format(max_obs)
Esempio n. 2
0
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions
    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    for index in range(iterations):

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time()-borgelt_start)


        freq = Borgelt.read_frequent_items(sample_freq_name)
        # Create ds of all observed triplets
        # Saved as sorted keys for lookup,
        # and their frequency as value
        observed = {}
        count = 0
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2, horrible hack to make Forward calculated the 
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        triplets = []
        pair_triple_ratios = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2))+1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]                                                                                                                                                                                                                          
                for n3 in s3_dict.keys():                                                                                                                                       
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle = (n1, n2, n3)  
                    triplets.append(triangle)

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))   

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)                                                                                                                                                                                                                                                                                                                                                                                                                                   

                    # Observed is the triple support, since sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=req_depth)

                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)

                    # heurestic, use max_ent for 0 triple in sample, does not make sense for all data
                    # est3 = s123 == 0 and est or est2

                    estimates.append(est)
                    # extrapolations.append(est2)
                    # heurestics.append(est3)
                    observations.append(obs)
                    triplets.append(triangle)

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    # MAPE error extrapolation
                    error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)
                    # MAPE error heurestic
                    # error3 = abs(obs-est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        
        del triangle_tree
        del sample_triples
                    
        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            # avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            # avg_errors_ext.append(avg_error_ext)
            
            # heurestic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)
            
            # variance
            var_error = var(MAPE_errors)
            # var_error_ext = tvar(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            # std_dev_ext = math.sqrt(var_error_ext)
            # std_error_ext = std_dev_ext / math.sqrt(sample_size)
            # span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            # span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heurestic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            # var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)
            
            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(estimates[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(extrapolations[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors)/float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
Esempio n. 3
0
def cross_validation_compact(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation. Using compact representation from
    Forward.
    """
    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)

    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequest items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):# TODO insert proper sampling

        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward_compact(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangle_tree, triples = Forward.forward_compact(frequent_items)
        print 'triangle roots: {}'.format(len(triangle_tree))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(len(transactions)-len(chunk)), num=int(math.log(len(transactions), 2))+1)

                    # maxumum estiamte seen (for plotting)
                    max_est = max(max_est, est)

                    # record the estimate
                    estimates.append(est)

                    # from all observed triples get the actual observed number of triples
                    observed = 0
                    if all_triples.has_key((n1, n2, n3)):
                        observed = all_triples[(n1, n2, n3)]

                    # maximum observation of the triple (for plotting)
                    max_obs = max(max_obs, observed)

                    # record the observed
                    observations.append(observed)

                    # record abs error
                    error = abs(obs-est) / float(obs) * 100
                    abs_errors.append(error)


        if len(abs_errors) > 0: #TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors) #tvar is the sample variance
            var_errors.append(var_error)

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
Esempio n. 4
0
def cross_validation(transactions, sample_pct=0.50, support=-3, all_frequent_items=None):
    from fim import fpgrowth
    """
    Cross validation, 'old' version not using compatct
    triangle representation from Forward.
    """
    # init
    _id = str(time()).replace('.','')
    # if all_frequent_items is None:
    #     all_frequent_items = fpgrowth(transactions, supp=support, min=1, max=3)

    cv_start = time()
    print "\n### Running cross validation {}###".format(_id)
    print "Total transactions:{}".format(len(transactions))
    # print "Total frequest items:{}".format(len(all_frequent_items))

    # run results
    avg_errors = []
    var_errors = []

    # all_triangles, all_triples = filter_items(all_frequent_items)

    for chunk, index, rest in chunks(transactions, int(len(transactions) * sample_pct)):# TODO insert proper sampling

        all_frequent_items = fpgrowth(rest, supp=support, min=1, max=3)
        all_triangles, all_triples = Forward.forward(all_frequent_items)

        # Get triples for estimates
        frequent_items = fpgrowth(chunk, supp=support, min=1, max=3)
        if len(frequent_items) > 0:
            print 'frequent items: {}'.format(len(frequent_items))
        else:
            print 'No frequent items in chunk: {}'.format(index)
            continue
        triangles, triples = Forward.forward(frequent_items)
        print 'triangles: {}'.format(len(triangles))

        estimates = []
        observations = []
        abs_errors = []
        max_est = 0
        max_obs = 0

        for (s1, s2, s3, s12, s23, s13, s123) in triangles:

            # if s123[1] != 0:
            #     continue
            # maxent estimate from the sample.
            # Index [1] of the tuples hold the # occurences in the sample
            est = ent.maxent_est_rosa(s1[1], s2[1], s3[1], s12[1], s23[1], s13[1], float(len(transactions)-len(chunk)), num=int(math.log(len(transactions), 2))+1)

            # maxumum estiamte seen (for plotting)
            max_est = max(max_est, est)

            # record the estimate
            estimates.append(est)

            # from all observed triples get the actual observed number of triples
            observed = 0
            if all_triples.has_key(s123[0]):
                observed = all_triples[s123[0]]

            # maximum observation of the triple (for plotting)
            max_obs = max(max_obs, observed)

            # record the observed
            observations.append(observed)

            # record abs error
            error = abs(obs-est) / float(obs) * 100
            abs_errors.append(error)



        if len(abs_errors) > 0: #TODO handle this, probably when nothing has been found
            # evaluation
            min_error = min(abs_errors)
            max_error = max(abs_errors)
            avg_error = sum(abs_errors) / float(len(abs_errors))
            avg_errors.append(avg_error)
            var_error = 0
            if len(abs_errors) > 1:
                var_error = tvar(abs_errors) #tvar is the sample variance
            var_errors.append(var_error)

            # TODO histogram of the average errors. max-ent, extrapolation, heurestic
            # TODO print average error og the average errors to the log.

            res_string = "\nResult:\nSample size:{} min_error:{} max_error:{} avg_error:{} var_error:{}".format(len(chunk), min_error, max_error, avg_errors[-1], var_error)
            print res_string
        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    total_avg_error = sum(avg_errors)/float(len(avg_errors))
    total_res_string = "Avg error:{}".format(total_avg_error)
    return path
Esempio n. 5
0
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create ds of all observed triplets
    # Saved as sorted keys for lookup,
    # and their frequency as value
    observed = {}
    count = 0
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)
    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record trip counts for the best estimats
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        if sample_pct > 0:
            sample_size= int(total_transactions*sample_pct)
        else:
            sample_size = abs(sample_pct)
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()
        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break

        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []

        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []

        triangle_counts = []

        # s1_list = []
        # s2_list = []
        # s3_list = []
        # s12_list = []
        # s13_list = []
        # s23_list = []

        # Recursion for estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    triangle = (n1, n2, n3)

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Get the obs (test data) frequency minus those found in the sample (training data)
                    obs = 0
                    if triangle in observed:
                         # (triples in data) - (triples in sample). Calculating the number of triples in test data.
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size))

                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimat
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heurestic, use max_ent for 0 triple in sample
                    est4 = s123 < 5 and est or est2

                    # base line estimat
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)
                    # TODO Do why save these? They already exist in the triangle tree (and take
                    # up shit load of space..)
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    #end TODO

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs) # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs-est)

                    # MAPE error extrapolation
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs-est2) / math.sqrt(obs) # * 100
                    MAPE_errors_ext.append(error2)

                    # MAPE error independence
                    error3 = abs(obs-est3) / math.sqrt(obs) # * 100
                    MAPE_errors_ind.append(error3)

                    # MAPE error heurestic
                    error4 = abs(obs-est4) / math.sqrt(obs) # * 100
                    MAPE_errors_heu.append(error4)

                    # MAPE baseline error
                    error5 = abs(obs-est5) / math.sqrt(obs) #* 100
                    MAPE_errors_baseline.append(error5)

                    # Record error for the estimeate that performed best
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError, ie:
                        pass


        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors
        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)

            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)

            # heurestic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)

            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors) #tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)


            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heurestic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))

            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(estimates[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    fd.write(str(extrapolations[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(heurestics):
                    fd.write(str(heurestics[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')
            with open(path + str(index) + '_data_independece.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(independences):
                    fd.write(str(independences[_index]) + '\t' + str(observations[_index]) + '\t' + str(triplets[_index][0]) + '\t' + str(triplets[_index][1]) + '\t' + str(triplets[_index][2]) + '\t' + str(pair_triple_ratios[_index]) + '\t' + str(triangle_counts[_index][0]) + '\t' + str(triangle_counts[_index][1]) + '\t' + str(triangle_counts[_index][2]) + '\t' + str(triangle_counts[_index][3]) + '\t' + str(triangle_counts[_index][4]) + '\t' + str(triangle_counts[_index][5]) + '\t' + str(triangle_counts[_index][6]) + '\n')

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            #saves amounts of all subsets of triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' + str(s12[_index]) + '\t' + str(s13[_index]) + '\t'+ str(s23[_index]) + '\n')

            #saves independence estimate for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independece estimat'?
            # TODO Why not calculate and save estimates in the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #     	tempVal1 = sample_size/(s1[_index])
            #     	tempVal2=sample_size/(s2[_index])
            #     	tempVal3=sample_size/(s3[_index])
            #     	tempVal12=sample_size/(s12[_index])
            #     	tempVal13=sample_size/(s13[_index])
            #     	tempVal23=sample_size/(s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))


            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'