Example #1
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total transactions: ', total_transactions
    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    for index in range(iterations):

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time()-borgelt_start)


        # Check any frequent items were found before parsing the output
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        freq = Borgelt.read_frequent_items(sample_freq_name)
        # Create ds of all observed triplets
        # Saved as sorted keys for lookup,
        # and their frequency as value
        observed = {}
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2, horrible hack to make Forward calculate the
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion depth required for the estimate to converge
        req_depth = int(math.log(total_transactions, 2))+1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle = (n1, n2, n3)
                    triplets.append(triangle)

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed is the triple support, since sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=req_depth)

                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)

                    # heurestic, use max_ent for 0 triple in sample, does not make sense for all data
                    # est3 = s123 == 0 and est or est2

                    estimates.append(est)
                    extrapolations.append(est2)
                    # heurestics.append(est3)
                    observations.append(obs)

                    # error of the max ent estimate, normalized by sqrt(obs)
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    # error of the extrapolation estimate, normalized by sqrt(obs)
                    error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)
                    # MAPE error heurestic
                    # error3 = abs(obs-est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        
        del triangle_tree
        del sample_triples
                    
        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)
            
            # heurestic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)
            
            # variance
            var_error = var(MAPE_errors)
            var_error_ext = var(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heurestic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)
            
            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = [estimates[_index], observations[_index]] + list(triplets[_index]) + [pair_triple_ratios[_index]] + list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = [extrapolations[_index], observations[_index]] + list(triplets[_index]) + [pair_triple_ratios[_index]] + list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors)/float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)
        print total_res_string
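
A minimal way to drive this function, assuming a Borgelt fpgrowth binary and a tab-separated transaction file (the binary name and the .tab path below are placeholders, not taken from this code); Borgelt's tools interpret a negative '-s' value as an absolute support threshold:

# Hypothetical invocation; 'fpgrowth' and the .tab path are placeholders.
est_all_data_disc_version(
    algorithm='fpgrowth',
    tab_file='../data/transactions.tab',
    min_support=-30,
    iterations=1,
    extra_id='alldata_demo')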
Example #2
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create ds of all observed triplets
    # Saved as sorted keys for lookup,
    # and their frequency as value
    observed = {}
    count = 0
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)
    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record triple counts for the best estimates
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        # sample_pct > 0 is a fraction of the data; a non-positive value is
        # treated as an absolute sample size.
        if sample_pct > 0:
            sample_size = int(total_transactions*sample_pct)
        else:
            sample_size = int(abs(sample_pct))
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()
        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break

        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []

        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []

        triangle_counts = []

        # s1_list = []
        # s2_list = []
        # s3_list = []
        # s12_list = []
        # s13_list = []
        # s23_list = []

        # Recursion depth required for the estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    triangle = (n1, n2, n3)

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Observed frequency in the test data: count in all data minus count in the sample (training data)
                    obs = 0
                    if triangle in observed:
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size))
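                    # The max ent estimate above is computed from the sample counts and then
                    # scaled by test_data_size / float(sample_size) to project it onto the
                    # held-out test data.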

                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s23, s13, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimate
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heuristic: use the max ent estimate when the triple is rare in the sample (< 5), otherwise extrapolate
                    est4 = est if s123 < 5 else est2

                    # baseline estimate
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)
                    # TODO Do why save these? They already exist in the triangle tree (and take
                    # up shit load of space..)
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    #end TODO

                    # error of the max ent estimate, normalized by sqrt(obs)
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    true_errors.append(obs-est)

                    # error of the extrapolation estimate
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)

                    # error of the independence estimate
                    error3 = abs(obs-est3) / math.sqrt(obs)
                    MAPE_errors_ind.append(error3)

                    # error of the heuristic estimate
                    error4 = abs(obs-est4) / math.sqrt(obs)
                    MAPE_errors_heu.append(error4)

                    # error of the baseline estimate
                    error5 = abs(obs-est5) / math.sqrt(obs)
                    MAPE_errors_baseline.append(error5)

                    # Record which estimate performed best for this triple count
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError:
                        pass


        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors
        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)

            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)

            # heurestic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)

            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors) #tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)


            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heurestic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))

            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = [estimates[_index], observations[_index]] + list(triplets[_index]) + [pair_triple_ratios[_index]] + list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = [extrapolations[_index], observations[_index]] + list(triplets[_index]) + [pair_triple_ratios[_index]] + list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(heurestics):
                    row = [heurestics[_index], observations[_index]] + list(triplets[_index]) + [pair_triple_ratios[_index]] + list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')
            with open(path + str(index) + '_data_independence.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(independences):
                    row = [independences[_index], observations[_index]] + list(triplets[_index]) + [pair_triple_ratios[_index]] + list(triangle_counts[_index])
                    fd.write('\t'.join(str(v) for v in row) + '\n')

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            #saves amounts of all subsets of triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' + str(s12[_index]) + '\t' + str(s13[_index]) + '\t'+ str(s23[_index]) + '\n')

            #saves independence estimate for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independece estimat'?
            # TODO Why not calculate and save estimates in the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #     	tempVal1 = sample_size/(s1[_index])
            #     	tempVal2=sample_size/(s2[_index])
            #     	tempVal3=sample_size/(s3[_index])
            #     	tempVal12=sample_size/(s12[_index])
            #     	tempVal13=sample_size/(s13[_index])
            #     	tempVal23=sample_size/(s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))


            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'
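
The per-triangle baselines that cross_validate_disc_version compares against the max ent estimate have simple closed forms. The sketch below recomputes the extrapolation and independence estimates on made-up counts; the numbers are illustrative only and not taken from the module:

# Toy counts for one triple from a hypothetical 10% sample of 10,000 transactions
# (9,000 transactions held out as test data). Numbers are invented for illustration.
sample_size, test_data_size = 1000, 9000
s1, s2, s3, s123 = 120, 95, 80, 6

# extrapolation (est2 above): scale the sampled triple count up to the test data
est_extrapolation = s123 / float(sample_size) * test_data_size          # 54.0

# independence (est3 above): product of the three singleton frequencies
est_independence = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size   # ~8.2

print 'extrapolation:', est_extrapolation
print 'independence:', est_independence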