Example #1
def create_zero_trip_files(folder):
    """
    Iterate over each iteration's .tsv file and write only the estimates
    whose triple count was 0 in the sample to a separate file.
    """
    import os
    from parsers import CVOutputParser
    iteration = 0
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        
        if not os.path.exists(max_ent_file):
            break

        # Cleaned file name
        max_ent_zero_trips_file = folder + str(iteration) + '_data_zero_trips.tsv'
        fd = open(max_ent_zero_trips_file, 'w')

        # write header
        fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

        # Keep only triples that were unobserved (s123 == 0) in the sample
        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            if s123 != 0:
                continue

            fd.write('\t'.join(map(str, (est, obs, n1, n2, n3, ratio,
                                         s1, s2, s3, s12, s13, s23, s123))) + '\n')

        fd.close()

        iteration += 1
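
CVOutputParser comes from the project's parsers module, which is not shown here. For reference, a minimal sketch of a reader for the *_data_zero_trips.tsv files written above, assuming plain tab-separated rows in the header's column order (the function name read_zero_trips_file is hypothetical):

# Hypothetical reader for the *_data_zero_trips.tsv files written above.
# Assumes one tab-separated row per estimate, in the header order:
# est obs n1 n2 n3 pair_trip_ratio s1 s2 s3 s12 s13 s23 s123
def read_zero_trips_file(path):
    with open(path) as fd:
        next(fd)  # skip the header line
        for line in fd:
            cols = line.rstrip('\n').split('\t')
            est, obs, n1, n2, n3, ratio = map(float, cols[:6])
            triangle = tuple(map(float, cols[6:13]))
            yield (n1, n2, n3), (est, obs, ratio, triangle)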
Example #2
def error_averages_for_triple_counts(output_folder):
    """
    Average error calculation on CV output: accumulated errors
    against the triple count observed in the sample.
    """
    from parsers import CVOutputParser
    from utils import interpolate, avg
    import math
    from collections import Counter
    import os

    if output_folder[-1] != '/':
        output_folder += '/'

    baseline = 88.5
    #max ent
    iteration = 0
    max_ent_acc_errors = [0 for x in range(100000)]
    baseline_acc_errors = [0 for x in range(100000)]
    occurrences = [0 for x in range(100000)]
    while True:
        tsv_file = output_folder + str(iteration) + '_data.tsv'

        if not os.path.exists(tsv_file):
            break

        for (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version(tsv_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            # if obs < 200:
            #     continue

            try:
                occurrences[int(obs)] += 1
                max_ent_acc_errors[int(obs)] += abs(est-obs) / math.sqrt(obs)
                baseline_acc_errors[int(obs)] += abs(baseline-obs) / math.sqrt(obs)
            except IndexError:
                pass

        iteration += 1
        print('iteration:', iteration)
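
The fragment above stops after accumulating errors and occurrence counts per observed triple count. A sketch of the averaging step these accumulators build toward (the helper name is hypothetical; empty buckets are skipped rather than divided by zero):

# Hypothetical averaging step over the accumulators built above: divide
# each bucket's accumulated error by its occurrence count.
def average_bucket_errors(acc_errors, occurrences):
    return [acc / count if count > 0 else 0.0
            for acc, count in zip(acc_errors, occurrences)]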
Example #3
def calc_avg_errors(output_folder):
    """
    Average error calculation on CV output.
    """
    from parsers import CVOutputParser
    from utils import interpolate, avg
    import math
    from collections import Counter
    import os

    if output_folder[-1] != '/':
        output_folder += '/'
    
    # better_than_baseline_file = open('better_than_base_line.tsv', 'w')
    # better_than_baseline_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

    # small_error_file = open('small_error.tsv', 'w')
    # small_error_file.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')    
    baseline = 88.5
    iteration = 0
    points_evaluated = 0
    over_estimates = 0
    all_sample_errors = []
    while True:
        tsv_file = output_folder + str(iteration) + '_data_zero_trips.tsv'

        if not os.path.exists(tsv_file):
            break

        sample_errors = []
        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(tsv_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            # if int(obs) < 200 or s123 == 0:
            #     continue

            # Heuristic for extrapolation, 200000 in sample
            # est = min(s12, s13, s23) / 200000. * (21006480-200000)

            points_evaluated += 1
            if est > obs:
                over_estimates += 1

            # if obs > baseline:
            #     if abs(est-obs) < abs(est-baseline):
            #         better_than_baseline_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n')

            error = abs(est-obs) / math.sqrt(obs)
            # if error < 3:
            #     small_error_file.write(str(est) + '\t' + str(obs) + '\t' + str(n1) + '\t' + str(n2) + '\t' + str(n3) + '\t' + str(ratio) + '\t' + str(s1) + '\t' + str(s2) + '\t' + str(s3) + '\t' + str(s12) + '\t' + str(s13) + '\t' + str(s23) + '\t' + str(s123) + '\n')
            sample_errors.append(error)
        all_sample_errors.append(avg(sample_errors))
        iteration += 1

    # better_than_baseline_file.close()
    # small_error_file.close()

    avg_error = avg(all_sample_errors)
    print('avg_error', avg_error)
    print('points evaluated', points_evaluated)
    print('over estimates:', over_estimates)
    return avg_error, all_sample_errors
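
avg is imported from the project's utils module, which is not shown. A minimal stand-in consistent with how it is used here; note that avg(sample_errors) can receive an empty list when a file contains no qualifying rows, so this stand-in guards against that (the empty-input behavior is an assumption):

# Hypothetical stand-in for utils.avg: the arithmetic mean, returning
# 0.0 for an empty sequence so that empty samples do not crash.
def avg(values):
    values = list(values)
    return sum(values) / len(values) if values else 0.0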
Example #4
            except IndexError:
                pass

        iteration += 1
        print('iteration:', iteration)

    # extrapolation
    ext_acc_errors = [0 for x in range(100000)]
    iteration = 0
    while True:
        tsv_file = output_folder + str(iteration) + '_data_extrapolation.tsv'

        if not os.path.exists(tsv_file):
            break

        for (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version(tsv_file):

            s1, s2, s3, s12, s13, s23, s123 = triangle

            # if obs < 200:
            #     continue

            try:
                ext_acc_errors[int(obs)] += abs(est-obs) / math.sqrt(obs)
            except IndexError:
                pass

        iteration += 1
        print('iteration:', iteration)

    for i, count in enumerate(occurrences):
Example #5
def merge_sample(folder):
    """
    Creates a single .tsv file with both the maxent and the
    extrapolation results.
    """
    import os
    from parsers import CVOutputParser

    iteration = 0
    maxent_estimates = []
    while True:
        max_ent_file = folder + str(iteration) + '_data.tsv'
        
        if not os.path.exists(max_ent_file):
            break

        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(max_ent_file):
            s1, s2, s3, s12, s13, s23, s123 = triangle
            maxent_estimates.append(est)
        iteration += 1
        print('iteration', iteration)


    # merged file name
    merged_file = folder + 'merged_estimates.tsv'
    fd = open(merged_file, 'w')
    # write header
    fd.write('est\text\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')

    iteration = 0
    estimate_number = 0
    while True:
        ext_file = folder + str(iteration) + '_data_extrapolation.tsv'
        
        if not os.path.exists(ext_file):
            break

        for (n1, n2, n3), (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_file):
            s1, s2, s3, s12, s13, s23, s123 = triangle
            fd.write('\t'.join(map(str, (maxent_estimates[estimate_number], est, obs,
                                         n1, n2, n3, ratio,
                                         s1, s2, s3, s12, s13, s23, s123))) + '\n')
            estimate_number += 1

        iteration += 1
        print('iteration', iteration)
    fd.close()
    print('merging files done')
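
The merged file is consumed later via CVOutputParser.read_merged_file_disc_version (see Examples #9 and #10). A compatible reader sketch, assuming the column order of the header written above (the function name read_merged_file is hypothetical):

# Hypothetical reader for merged_estimates.tsv, following the header
# est ext obs n1 n2 n3 pair_trip_ratio s1 s2 s3 s12 s13 s23 s123
def read_merged_file(path):
    with open(path) as fd:
        next(fd)  # skip the header line
        for line in fd:
            cols = [float(x) for x in line.rstrip('\n').split('\t')]
            est, ext, obs, n1, n2, n3, ratio = cols[:7]
            triangle = tuple(cols[7:14])
            yield (n1, n2, n3), (est, ext, obs, ratio, triangle)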


# def confidence_interval():
    # In some sample, we calculate the % errors.
    # This gives some distribution.
    # Calculate errors as a percentage; currently absolute errors
    # have the issue that big estimates weigh too much.
    # Cross validate the max ent estimate on some subset of triples.
    # How good/bad is this estimate? Confidence interval,
    # i.e. with 95% probability our error mean is only x std. from the true
    # error mean.

    # Can this confidence tell us anything about how well a sample will work,
    # i.e. how good estimates are?

    # Could we calculate this for both max_ent and extrapolation and
    # find some threshold there?

    # Variance in the data: the distribution of the sample has some variance,
    # i.e. frequency of item counts; can this be related to the error?



# def test_triple_sort():
#     res = (1, 2, 3)
#     assert triple_sort((1, 2, 3)) == res
#     assert triple_sort((1, 3, 2)) == res
#     assert triple_sort((2, 1, 3)) == res
#     assert triple_sort((2, 3, 1)) == res, triple_sort((2, 3, 1))
#     assert triple_sort((3, 2, 1)) == res
#     assert triple_sort((3, 1, 2)) == res

#     res = (1,1,3)
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((1, 1, 3)) == res
#     assert triple_sort((1, 3, 1)) == res
#     assert triple_sort((3, 1, 1)) == res
#     assert triple_sort((3, 1, 1)) == res

# def test_chunks():
#     l = [1,2,3,4,5,6]
#     i=0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) == 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 2):
#         assert len(chunk) <= 2, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

#     l = [1,2,3,4,5]
#     i = 0
#     for chunk, index, rest in chunks(l, 3):
#         assert len(chunk) <= 3, (chunk, rest)
#         assert not chunk in rest, (chunk, rest)
#         assert index == i, (index, i)
#         i += 1

# test_chunks()
Example #6
def triple_errors(output_folder, triple):
    """
    Collect and plot error histograms for the maxent and extrapolation
    estimators on a single triple across all CV samples.
    """
    from parsers import CVOutputParser
    from utils import interpolate, avg, confidence_interval
    from matplotlib.pyplot import xlabel, ylabel, hist
    import math
    from collections import Counter
    import os

    if output_folder[-1] != "/":
        output_folder += "/"

    iteration = -1
    max_ent_errors = []
    ext_errors = []
    max_ent_abs_errors = []
    ext_abs_errors = []
    samples_ignored = 0
    while True:
        iteration += 1
        max_ent_est_file = output_folder + str(iteration) + "_data.tsv"
        ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv"
        # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
        # read baseline also?
        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        # Read the maxent estimate
        found = False
        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(
            max_ent_est_file
        ):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                # if s123 == 0:
                #     break
                found = True
                max_ent_errors.append(est - obs)
                max_ent_abs_errors.append(abs(obs - est))
                break

        if not found:
            samples_ignored += 1
            continue

        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_est_file):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                ext_errors.append(est - obs)
                ext_abs_errors.append(abs(obs - est))
                break

    # maxent confidence interval
    maxent_ci = confidence_interval(max_ent_errors)
    # extrapolation confidence interval
    ext_ci = confidence_interval(ext_errors)

    print "samples ignored: ", samples_ignored
    print "maxent avg error: ", round(avg(max_ent_errors), 1)
    print "maxent 95% confidence interval: ", (round(maxent_ci[0], 1), round(maxent_ci[1], 2))
    print "extrapolation avg error: ", round(avg(ext_errors), 1)
    print "extrapolation 95% confidence interval: ", (round(ext_ci[0], 1), round(ext_ci[1], 2))

    # round
    max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors]
    ext_errors_rounded = [round(x, 1) for x in ext_errors]

    # plot
    xlabel("Estimate error")
    ylabel("Bucket size")
    # text(0.1, 0.8, 'Maxent')
    # text(0.1, 0.7, 'avg. error: ' + str(avg(max_ent_errors)))
    # text(0.1, 0.6, '95% conf. interval: ' + str(maxent_ci))

    # text(0.5, 0.8, 'Extrapolation')
    # text(0.5, 0.7, 'avg. error: ' + str(avg(ext_errors)))
    # text(0.5, 0.6, '95% conf. interval: ' + str(ext_ci))

    hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r"))

    return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors
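
confidence_interval also comes from utils and is not shown. A plausible stand-in, assuming a normal approximation for the 95% interval of the mean error (requires at least two values):

import math

# Hypothetical stand-in for utils.confidence_interval: a 95% normal
# approximation interval for the mean of the observed errors.
def confidence_interval(values):
    n = len(values)  # assumes n >= 2
    mean = sum(values) / n
    var = sum((x - mean) ** 2 for x in values) / (n - 1)
    half_width = 1.96 * math.sqrt(var / n)
    return mean - half_width, mean + half_width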
Example #7
def plot_intervals(output_folder):
    """
    Given a cross validation output, certain triple intervals can be
    plotted to compare the error for extrapolation, max ent and the
    heuristic.

    The algorithm runs through each triple interval; for each sampled
    estimate output, the triples in the interval are looked up in each
    sample, the MAPE error is recorded, and the average errors are
    accumulated. The average of these averages is then plotted for each
    interval.
    """
    from parsers import CVOutputParser
    from preprocessing import Preprocessor
    from utils import avg
    from matplotlib.pyplot import plot
    import os
    import math

    if output_folder[-1] != '/':
        output_folder += '/'
    intervals = 30
    triple_intervals = Preprocessor.triple_intervals(output_folder + 'observed_frequent_items.out', intervals=intervals)

    avg_max_ent_errors = []
    avg_ext_errors = []
    avg_heu_errors = []
    pair_triple_ratios = [i/10. for i in range(11)] # binned ratios [0.0 to 1.0]
    max_ent_ratio_error = [0 for i in range(11)]
    ext_ratio_error = [0 for i in range(11)]

    for index, triple_interval in enumerate(triple_intervals):
        print('Triple interval {} of {}'.format(index, intervals))
        iteration = 0
        MAPE_avg_errors = []
        MAPE_avg_errors_ext = []
        # MAPE_avg_errors_heu = []
        while True:
            max_ent_est_file = output_folder + str(iteration) + '_data.tsv'
            ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv'
            # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
            # read baseline also?
            # Read until we do not find an output file
            if not os.path.exists(max_ent_est_file):
                break

            max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file)
            ext_est = CVOutputParser.read_est_obs_file(ext_est_file)
            # heu_est = CVOutputParser.read_est_obs_file(heu_est_file)

            MAPE_errors = []
            MAPE_errors_ext = []
            # MAPE_errors_heu = []

            for triple in triple_interval:
                # Check that the triple has been estimated
                if triple in max_ent_est:

                    # Index 1 should hold the observed value parsed from the file;
                    # it is the same for every estimate, so just read it once.
                    obs = max_ent_est[triple][1]

                    # maxent estimate
                    est = max_ent_est[triple][0]

                    # extrapolation estimate
                    est2 = ext_est[triple][0]

                    # independence estimate?

                    # heuristic: use max_ent for 0 triples in sample
                    # est4 = heu_est[triple][0]

                    # Index 2 should hold the pair triple ratio;
                    # it is the same for every estimate.
                    ratio = max_ent_est[triple][2]
                    # bin the ratio to one decimal
                    ratio_binned = round(ratio, 1)
                    # add errors to the ratio
                    max_ent_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est-obs) / float(obs)
                    ext_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est2-obs) / float(obs)


                    # MAPE error max ent
                    # error = abs(obs-est) #/ float(obs) * 100
                    # MAPE_errors.append(error)

                    # # MAPE error extrapolation
                    # error2 = abs(obs-est2) #/ float(obs) * 100
                    # MAPE_errors_ext.append(error2)

                    # MAPE error independence?

                    # MAPE error heuristic
                    # error4 = abs(obs-est4) #/ float(obs) * 100
                    # MAPE_errors_heu.append(error4)

                    

                    # MAPE baseline error?
            MAPE_avg_errors.append(avg(MAPE_errors))
            MAPE_avg_errors_ext.append(avg(MAPE_errors_ext))
            # MAPE_avg_errors_heu.append(avg(MAPE_errors_heu))
            iteration += 1

        avg_max_ent_errors.append(avg(MAPE_avg_errors))
        avg_ext_errors.append(avg(MAPE_avg_errors_ext))
        # avg_heu_errors.append(avg(MAPE_avg_errors_heu))
        

    plot(range(len(avg_max_ent_errors)), avg_max_ent_errors, color='blue')
    plot(range(len(avg_ext_errors)), avg_ext_errors, color='red')
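
A minimal usage sketch; the folder name is illustrative and assumes the CV output layout described above:

# Hypothetical usage: render the interval comparison for one CV run.
import matplotlib.pyplot as plt

plot_intervals('cv_output/')  # illustrative folder name
plt.xlabel('Triple interval')
plt.ylabel('Avg. MAPE error')
plt.show()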
Example #8
def plot_ratios(output_folder):
    """
    Plot accumulated errors for estimators against pair triple ratios.
    Ratios are binned in the range 0.0 to 1.0.
    """
    from parsers import CVOutputParser
    from utils import interpolate
    import math
    from collections import Counter
    import os

    if output_folder[-1] != '/':
        output_folder += '/'

    pair_triple_ratios = [i/10. for i in range(11)]
    max_ent_ratio_error = [0 for i in range(11)]
    ext_ratio_error = [0 for i in range(11)]
    maxent_better_ratio = [0 for i in range(11)]
    ext_better_ratio = [0 for i in range(11)]
    values_binned = 0
    values_ignored = 0
    iteration = 0
    pair_counts = Counter()
    trip_counts = Counter()
    while True:
        max_ent_est_file = output_folder + str(iteration) + '_data.tsv'
        ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv'
        # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
        # read baseline also?
        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file)
        ext_est = CVOutputParser.read_est_obs_file(ext_est_file)
        # heu_est = CVOutputParser.read_est_obs_file(heu_est_file)

        for triple in max_ent_est.keys():

            (s1, s2, s3, s12, s13, s23, s123) = max_ent_est[triple][3]
            pair_counts[s12] += 1
            pair_counts[s13] += 1
            pair_counts[s23] += 1
            trip_counts[s123] += 1



            # if not s123 < max_trips or not min(s12, s13, s23) > min_pairs:
            #     values_ignored += 1
            #     continue
            # Index 1 should hold the observed value parsed from the file;
            # it is the same for every estimate, so just read it once.
            obs = max_ent_est[triple][1]

            # if obs < 200:
            #     values_ignored += 1
            #     continue

            if obs < 200:
                continue

            # maxent estimate
            est = max_ent_est[triple][0]

            # extrapolation estimate
            est2 = ext_est[triple][0]

            # independence estimate?

            # heuristic: use max_ent for 0 triples in sample
            # est4 = heu_est[triple][0]

            # Index 2 should hold the pair triple ratio;
            # it is the same for every estimate.
            ratio = max_ent_est[triple][2]
            # bin the ratio to one decimal
            ratio_binned = round(ratio, 1)

            # Record the ratio if maxent was better
            maxent_error = abs(est-obs)/math.sqrt(obs)
            ext_error = abs(est2-obs)/math.sqrt(obs)

            try:
                if maxent_error < ext_error:
                    maxent_better_ratio[pair_triple_ratios.index(ratio_binned)] += 1
                elif maxent_error > ext_error:
                    ext_better_ratio[pair_triple_ratios.index(ratio_binned)] += 1
            except ValueError:
                pass

            # add errors to the ratio bin; only count a value as binned
            # once the bin lookup has actually succeeded
            try:
                max_ent_ratio_error[pair_triple_ratios.index(ratio_binned)] += maxent_error
                ext_ratio_error[pair_triple_ratios.index(ratio_binned)] += ext_error
                values_binned += 1
            except ValueError:
                pass
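
The snippet ends before any plotting takes place. A sketch of how the binned accumulators could be rendered (matplotlib is assumed; the helper name is hypothetical):

# Hypothetical plotting step for the accumulators built above.
import matplotlib.pyplot as plt

def plot_binned_errors(pair_triple_ratios, max_ent_ratio_error, ext_ratio_error):
    plt.plot(pair_triple_ratios, max_ent_ratio_error, color='blue', label='maxent')
    plt.plot(pair_triple_ratios, ext_ratio_error, color='red', label='extrapolation')
    plt.xlabel('Pair/triple ratio (binned)')
    plt.ylabel('Accumulated error')
    plt.legend()
    plt.show()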
Example #9
def error_ratios_cross_val(output_folder):
    """
    Cross validation on the error ratios to find optimal
    triangle values
    """

    from parsers import CVOutputParser
    from utils import avg

    if output_folder[-1] != '/':
        output_folder += '/'


    singleton_thresholds = [0, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000]
    pair_thresholds = [0, 1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]
    triple_thresholds = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250]
    # Results are inserted at an offset
    # acc_error, count, maxent_best, ext_best
    c = [[[(0,0,0,0, (0,0,0)) for z in range(len(triple_thresholds))] for y in range(len(pair_thresholds))] for x in range(len(singleton_thresholds))]


    merged_file = output_folder + 'merged_estimates.tsv'

    iteration = 0
    for (n1, n2, n3), (est, ext, obs, ratio, triangle) in CVOutputParser.read_merged_file_disc_version(merged_file):

        s1, s2, s3, s12, s13, s23, s123 = triangle

        # Calculate errors and add them to the result matrix.
        # Ratio error between estimates.
        error = 0
        # Check if both estimators are spot on:
        if abs(ext - obs) == 0 and abs(est - obs) == 0:
            error = 1.
        # Check that we are not dividing by a very small floating point
        # from extrapolation: if the denominator is below one, fall back
        # to the absolute maxent error.
        elif abs(ext - obs) < 1:
            error = float(abs(est - obs))
        # Get the error ratio (the zero denominator is excluded above).
        else:
            error = abs(est - obs) / float(abs(ext - obs))
        # ratio_errors.append(error)
        for singleton_index, singleton_threshold in enumerate(singleton_thresholds):
            if not min(s1, s2, s3) > singleton_threshold:
                break
            for pair_index, pair_threshold in enumerate(pair_thresholds):
                if not min(s12, s13, s23) > pair_threshold:
                    break
                for triple_index, triple_threshold in enumerate(triple_thresholds):
                    if not s123 < triple_threshold:
                        continue
                    acc_error, count, maxent_best, ext_best, (s, p, t) = c[singleton_index][pair_index][triple_index]
                    acc_error += error
                    count += 1
                    if error < 1:
                        maxent_best += 1
                    elif error > 1:
                        ext_best += 1
                    c[singleton_index][pair_index][triple_index] = (acc_error, count, maxent_best, ext_best, (singleton_threshold, pair_threshold, triple_threshold))
        if iteration % 1000000 == 0:
            print('iteration:', iteration)
        iteration += 1

        # maxent_errors.append(est / float(obs))
        # ext_errors.append(ext / float(obs))

    # Compute average errors
    for singleton_index, singleton_threshold in enumerate(singleton_thresholds):
        for pair_index, pair_threshold in enumerate(pair_thresholds):
            for triple_index, triple_threshold in enumerate(triple_thresholds):
                (acc_error, count, maxent_best, ext_best, (s,p,t)) = c[singleton_index][pair_index][triple_index]
                if count > 0:
                    c[singleton_index][pair_index][triple_index] = (acc_error / float(count), count, maxent_best, ext_best, (s,p,t))

    # ratio_error = sum(ratio_errors) / float(len(ratio_errors))
    # ext_ratio = avg(ext_errors)
    # maxent_ratio = avg(maxent_errors)

    # print 'Singletons done for threshold: ', singleton_threshold

    # fd.close()

    # fd = open(output_folder + 'parameter_cv.tsv', 'wr')
    # fd.write('singleton\tpair\ttriple\tmax_ent\text\tratio_error\n')
    # fd.write(singleton + '\t' + pair + '\t' + triple + '\t' + maxent_ratio + '\t' + ext_ratio + '\t' + ratio_error + '\n')
    # max_val = 1000
    # offset = 30
    # hist([x for x in range(max_val)[offset:]], ratio_errors[offset:max_val], color='green')

    return c
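
The returned matrix can then be scanned for the threshold combination with the lowest average error ratio. A sketch (the helper name and the minimum-count cutoff are arbitrary choices):

# Hypothetical scan of the result matrix returned above: find the
# threshold triple with the lowest average error ratio among cells
# that accumulated at least min_count observations.
def best_thresholds(c, min_count=1):
    best = None
    for plane in c:
        for row in plane:
            for avg_error, count, maxent_best, ext_best, thresholds in row:
                if count >= min_count and (best is None or avg_error < best[0]):
                    best = (avg_error, count, thresholds)
    return best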
Example #10
def error_ratios(output_folder, s_min=None, p_min=None, t_max=None, obs_min=None):
    """
    Error ratio against triple count in sample on a CV result.
    Needs the merged_estimates.tsv file that can be created
    with the relevant script in utils.py
    """

    from parsers import CVOutputParser
    from utils import interpolate, avg
    import math
    from collections import Counter
    import os

    if output_folder[-1] != '/':
        output_folder += '/'

    max_singleton_occurrence = -1
    max_pair_occurrence = -1
    max_triple_occurrence = -1
    #max ent
    occurrence_ratio_errors = [0 for x in range(100000)]
    ratio_errors = []
    occurrences = [0 for x in range(100000)]
    merged_file = output_folder + 'merged_estimates.tsv'
    maxent_errors = []
    ext_errors = []
    iteration = 0
    maxent_was_best_estimates = []
    ext_was_best = []
    for (n1, n2, n3), (est, ext, obs, ratio, triangle) in CVOutputParser.read_merged_file_disc_version(merged_file):

        s1, s2, s3, s12, s13, s23, s123 = triangle

        iteration += 1
        if iteration % 1000000 == 0:
            print('iteration:', iteration)

        if s_min is not None:
            if not min(s1, s2, s3) > s_min:
                continue
        if p_min is not None:
            if not min(s12, s23, s13) > p_min:
                continue
        if t_max is not None:
            if not s123 < t_max:
                continue

        if obs_min is not None:
            if not obs > obs_min:
                continue



        if max(s1,s2,s3) > max_singleton_occurrence:
            max_singleton_occurrence = max(s1,s2,s3)
        if max(s12,s13,s23) > max_pair_occurrence:
            max_pair_occurrence = max(s12,s13,s23)
        if s123 > max_triple_occurrence:
            max_triple_occurrence = s123

        # get the absolute errors, 
        # if this is below one we 
        # set it to one to avoid problems
        # with dividing with numbers < 1
        abs_ext_obs = abs(ext-obs)
        if abs_ext_obs < 1:
            abs_ext_obs = 1
        abs_est_obs = abs(est-obs)
        if abs_est_obs < 1:
            abs_est_obs = 1

        error = math.log(abs_est_obs / abs_ext_obs)
        ratio_errors.append(error)

        # low max ent estimate; the magic number is the value of the estimates
        # when a pair value was 1 for maxent, or 1 for ext
        # if est <= 104.0324:
        if error < 0:
            maxent_was_best_estimates.append(((n1, n2, n3), (est, ext, obs, ratio, triangle)))
        elif error > 0:
            ext_was_best.append(((n1, n2, n3), (est, ext, obs, ratio, triangle)))


        maxent_errors.append(est / float(obs))
        ext_errors.append(ext / float(obs))
        try:
            occurrences[int(obs)] += 1
            occurrence_ratio_errors[int(obs)] += error
        except IndexError:
            pass
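
The fragment ends inside the loop. A sketch of a summary one could compute from the accumulators afterwards (the helper name is hypothetical; the other names follow the function above):

# Hypothetical post-loop summary for error_ratios: the mean log error
# ratio (negative favors maxent, positive favors extrapolation).
def summarize_ratio_errors(ratio_errors, maxent_was_best_estimates, ext_was_best):
    mean_log_ratio = sum(ratio_errors) / len(ratio_errors) if ratio_errors else 0.0
    return {
        'mean_log_error_ratio': mean_log_ratio,
        'maxent_best': len(maxent_was_best_estimates),
        'ext_best': len(ext_was_best),
    }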