Ejemplo n.º 1
0
def math_diff(thresh_dict, inputfile1, inputfile2, abs_diff_file,
              rel_diff_file, err_file, summary_csv):
    """Compare two time-series CSV files column by column.

    Only fields present in both files are compared, in the order in which
    they appear in the first file.  For every compared field the absolute
    and relative differences are computed per record and classified as
    "small" or "big" against the thresholds returned by
    ``thresh_dict.lookup(field)``.

    Args:
        thresh_dict: Object exposing ``lookup(key)`` that returns an
            ``(abs_thresh, rel_thresh)`` pair for a field name.
        inputfile1: Path of the first CSV file.
        inputfile2: Path of the second CSV file.
        abs_diff_file: Path the absolute-difference matrix is written to.
        rel_diff_file: Path the relative-difference matrix is written to.
        err_file: Path of the log/report file; messages and the detailed
            report are appended to it.
        summary_csv: Path of a one-line-per-comparison summary file, or a
            falsy value to skip writing it.

    Returns:
        A 4-tuple ``(status, num_records, num_big, num_small)``.  On early
        failure ``status`` is an error message and the three counts are 0.
        Otherwise ``status`` is one of ``'All Equal'``, ``'Small Diffs'`` or
        ``'Big Diffs'``.  When everything is equal the function returns
        before writing the diff files or the detailed report.
    """
    # Test for existence of input files
    if not os.path.exists(inputfile1):
        info('unable to open file <%s>' % inputfile1, err_file)
        return 'unable to open file <%s>' % inputfile1, 0, 0, 0
    if not os.path.exists(inputfile2):
        info('unable to open file <%s>' % inputfile2, err_file)
        return 'unable to open file <%s>' % inputfile2, 0, 0, 0

    # read data out of files
    try:
        mat1 = mycsv.getlist(inputfile1)
    except IndexError:
        return 'malformed or empty csv file: <%s>' % inputfile1, 0, 0, 0
    if len(mat1) < 2:
        info('<%s> has no data' % inputfile1, err_file)
        return '<%s> has no data' % inputfile1, 0, 0, 0
    try:
        mat2 = mycsv.getlist(inputfile2)
    except IndexError:
        return 'malformed or empty csv file: <%s>' % inputfile2, 0, 0, 0
    if len(mat2) < 2:
        info('<%s> has no data' % inputfile2, err_file)
        return '<%s> has no data' % inputfile2, 0, 0, 0

    # clean up the files
    matrix1 = fill_matrix_holes(mat1)
    matrix2 = fill_matrix_holes(mat2)

    # split out the time columns
    time1, mat1 = slicetime(matrix1)
    time2, mat2 = slicetime(matrix2)
    # Not going to compare two files with different time series
    if time1 != time2:
        info(
            'Time series in <%s> and <%s> do not match' %
            (inputfile1, inputfile2), err_file)
        return 'Time series do not match', 0, 0, 0

    # Only going to compare fields that are found in both files
    hset1 = set(mat1[0])
    hset2 = set(mat2[0])
    hset = hset1.intersection(hset2)
    if len(hset) == 0:
        info(
            'Input files <%s> and <%s> have no common fields' %
            (inputfile1, inputfile2), err_file)
        return 'No common fields', 0, 0, 0

    # Order will be order in which intersection fields appear in first file
    horder = [h for h in mat1[0] if h in hset]

    # Warn about fields that will not be compared
    hset_sdiff = hset1.symmetric_difference(hset2)
    for h in hset_sdiff:
        if h in hset1:
            mycsv.writecsv([[
                'Not comparing field %s, which appears in input files <%s>, but not <%s>'
                % (h, inputfile1, inputfile2)
            ]], err_file, 'a')
        else:
            mycsv.writecsv([[
                'Not comparing field %s, which appears in input files <%s>, but not <%s>'
                % (h, inputfile2, inputfile1)
            ]], err_file, 'a')

    # convert time matrix to dictionary (both time matrices should be identical here)
    tdict = matrix2hdict(time1)
    tkey = list(tdict.keys())[0]

    # convert data matrices to dictionaries
    hdict1 = matrix2hdict(mat1)
    hdict2 = matrix2hdict(mat2)

    # Dictionaries of absolute and relative differences
    abs_diffs = {}
    rel_diffs = {}
    for key in horder:
        abs_diffs[key] = list(map(abs_diff, hdict1[key], hdict2[key]))
        rel_diffs[key] = list(map(rel_diff, hdict1[key], hdict2[key]))

    # Per-field statistics: maxima, where they occur, and small/big counts
    err_dict = {}
    for key in horder:
        err_dict[key] = {}

        (abs_thresh, rel_thresh) = thresh_dict.lookup(key)

        max_abs_diff = max(abs_diffs[key])
        index_max_abs_diff = abs_diffs[key].index(max_abs_diff)
        err_dict[key]['abs_thresh'] = abs_thresh
        err_dict[key]['max_abs_diff'] = max_abs_diff
        err_dict[key]['rel_diff_of_max_abs_diff'] = rel_diffs[key][
            index_max_abs_diff]
        err_dict[key]['time_of_max_abs_diff'] = tdict[tkey][index_max_abs_diff]
        err_dict[key]['count_of_small_abs_diff'] = sum(
            1 for x in abs_diffs[key] if 0.0 < x <= abs_thresh)
        err_dict[key]['count_of_big_abs_diff'] = sum(1 for x in abs_diffs[key]
                                                     if x > abs_thresh)

        max_rel_diff = max(rel_diffs[key])
        index_max_rel_diff = rel_diffs[key].index(max_rel_diff)

        err_dict[key]['rel_thresh'] = rel_thresh
        err_dict[key]['max_rel_diff'] = max_rel_diff
        err_dict[key]['abs_diff_of_max_rel_diff'] = abs_diffs[key][
            index_max_rel_diff]
        err_dict[key]['time_of_max_rel_diff'] = tdict[tkey][index_max_rel_diff]
        if rel_thresh > 0:
            err_dict[key]['count_of_small_rel_diff'] = sum(
                1 for x in rel_diffs[key] if 0.0 < x <= rel_thresh)
            err_dict[key]['count_of_big_rel_diff'] = sum(
                1 for x in rel_diffs[key] if x > rel_thresh)
            # A record is "small" if either measure is within its threshold,
            # and "big" only if both measures exceed their thresholds.
            err_dict[key]['count_of_small_abs_rel_diff'] = sum(
                1 for x, y in zip(abs_diffs[key], rel_diffs[key])
                if 0 < x <= abs_thresh or 0 < y <= rel_thresh)
            err_dict[key]['count_of_big_abs_rel_diff'] = sum(
                1 for x, y in zip(abs_diffs[key], rel_diffs[key])
                if x > abs_thresh and y > rel_thresh)
        else:
            # Non-positive relative threshold: relative diffs are not
            # counted and the combined counts fall back to the absolute ones.
            err_dict[key]['count_of_small_rel_diff'] = 0
            err_dict[key]['count_of_big_rel_diff'] = 0
            err_dict[key]['count_of_small_abs_rel_diff'] = err_dict[key][
                'count_of_small_abs_diff']
            err_dict[key]['count_of_big_abs_rel_diff'] = err_dict[key][
                'count_of_big_abs_diff']

    num_small = sum(err_dict[key]['count_of_small_abs_rel_diff']
                    for key in horder)
    num_big = sum(err_dict[key]['count_of_big_abs_rel_diff'] for key in horder)

    diff_type = 'All Equal'
    if num_big > 0:
        diff_type = 'Big Diffs'
    elif num_small > 0:
        diff_type = 'Small Diffs'

    num_records = len(tdict[tkey])

    input_file_path_tokens = inputfile1.split(os.sep)

    # if it's the first pass, create the file with the header;
    # also the null-pointer-ish check allows skipping the summary_csv file if the filename is blank
    if summary_csv:
        # A bare file name has no parent directory token; use an empty case
        # name rather than failing with IndexError.
        if len(input_file_path_tokens) > 1:
            case_name = input_file_path_tokens[-2]
        else:
            case_name = ''
        if not os.path.isfile(summary_csv):
            with open(summary_csv, 'w') as f:
                f.write("CaseName,FileName,Status,#Records\n")
        with open(summary_csv, 'a') as f:
            f.write("%s,%s,%s,%s records compared\n" %
                    (case_name, input_file_path_tokens[-1],
                     diff_type, num_records))

    # We are done
    if diff_type == 'All Equal':
        return diff_type, num_records, num_big, num_small

    # Which columns had diffs?
    dhorder = [
        h for h in horder if err_dict[h]['count_of_small_abs_diff'] > 0
        or err_dict[h]['count_of_big_abs_diff'] > 0
        or err_dict[h]['count_of_small_rel_diff'] > 0
        or err_dict[h]['count_of_big_rel_diff'] > 0
    ]

    # Find the largest overall absolute diff
    max_max_abs_diff = max(err_dict[key]['max_abs_diff'] for key in dhorder)
    key_of_max_max_abs_diff = [
        key for key in dhorder
        if err_dict[key]['max_abs_diff'] == max_max_abs_diff
    ][0]
    rel_diff_of_max_max_abs_diff = err_dict[key_of_max_max_abs_diff][
        'rel_diff_of_max_abs_diff']
    time_of_max_max_abs_diff = err_dict[key_of_max_max_abs_diff][
        'time_of_max_abs_diff']

    # Find the largest overall relative diff
    max_max_rel_diff = max(err_dict[key]['max_rel_diff'] for key in dhorder)
    key_of_max_max_rel_diff = [
        key for key in dhorder
        if err_dict[key]['max_rel_diff'] == max_max_rel_diff
    ][0]
    abs_diff_of_max_max_rel_diff = err_dict[key_of_max_max_rel_diff][
        'abs_diff_of_max_rel_diff']
    time_of_max_max_rel_diff = err_dict[key_of_max_max_rel_diff][
        'time_of_max_rel_diff']

    # put the time column back
    abs_diffs[tkey] = tdict[tkey]
    rel_diffs[tkey] = tdict[tkey]

    # Summarize the input files
    summary_dict1 = make_summary_dict(tdict, hdict1)
    summary_dict2 = make_summary_dict(tdict, hdict2)

    # Flatten summaries out to dictionaries of lists rather than dictionaries of dictionaries
    summary_dict12 = dict_of_dicts2dict_of_lists(summary_dict1, horder,
                                                 list(summary_labels))
    summary_dict12[tkey] = [sl + ':' for sl in list(summary_labels)]

    summary_dict22 = dict_of_dicts2dict_of_lists(summary_dict2, horder,
                                                 list(summary_labels))
    summary_dict22[tkey] = [sl + ':' for sl in list(summary_labels)]

    # Diff the flattened summaries.  The results are materialised as lists
    # (as for abs_diffs/rel_diffs above): on Python 3 a bare map object is a
    # one-shot iterator, not a list.
    abs_diff_summary_dict = {}
    rel_diff_summary_dict = {}
    for key in dhorder:
        abs_diff_summary_dict[key] = list(
            map(abs_diff, summary_dict12[key], summary_dict22[key]))
        rel_diff_summary_dict[key] = list(
            map(rel_diff, summary_dict12[key], summary_dict22[key]))

    # Prepend time key to header order list
    thorder = [tkey] + horder
    tdhorder = [tkey] + dhorder

    # Convert the absolute and relative diff dictionaries to matrices and write them to files
    abs_diff_mat = hdict2matrix(tdhorder, abs_diffs)
    mycsv.writecsv(abs_diff_mat, abs_diff_file)
    rel_diff_mat = hdict2matrix(tdhorder, rel_diffs)
    mycsv.writecsv(rel_diff_mat, rel_diff_file)

    # Write the error file header
    mycsv.writecsv(
        [[],
         [
             'Max absolute diff: %s, field: %s, time: %s, relative: %s' %
             (str(max_max_abs_diff), str(key_of_max_max_abs_diff),
              str(time_of_max_max_abs_diff), str(rel_diff_of_max_max_abs_diff))
         ]], err_file, 'a')
    mycsv.writecsv(
        [[],
         [
             'Max relative diff: %s, field: %s, time: %s, absolute: %s' %
             (str(max_max_rel_diff), str(key_of_max_max_rel_diff),
              str(time_of_max_max_rel_diff), str(abs_diff_of_max_max_rel_diff))
         ]], err_file, 'a')

    # Convert the error dictionary to a matrix and write to the error
    # file.  Need to convert it from a nested dictionary to a
    # dictionary of lists first.
    err_dict2 = dict_of_dicts2dict_of_lists(err_dict, horder,
                                            list(error_labels))
    err_dict2[tkey] = [el + ':' for el in list(error_labels)]

    err_mat = hdict2matrix(tdhorder, err_dict2)
    mycsv.writecsv([[], []] + err_mat, err_file, 'a')

    # Convert the summaries to matrices and write them out to the error file
    summary_mat1 = hdict2matrix(thorder, summary_dict12)
    mycsv.writecsv([[], [], ['Summary of %s' % (inputfile1, )], []] +
                   summary_mat1, err_file, 'a')
    summary_mat2 = hdict2matrix(thorder, summary_dict22)
    mycsv.writecsv([[], [], ['Summary of %s' % (inputfile2, )], []] +
                   summary_mat2, err_file, 'a')

    # Convert the absolute and relative differences of the summaries and write them to the error file
    abs_diff_summary_dict[tkey] = [sl + ':' for sl in list(summary_labels)]
    abs_diff_summary_mat = hdict2matrix(tdhorder, abs_diff_summary_dict)
    mycsv.writecsv(
        [[], [],
         [
             'Absolute difference in Summary of %s and Summary of %s' %
             (inputfile1, inputfile2)
         ], []] + abs_diff_summary_mat, err_file, 'a')
    rel_diff_summary_dict[tkey] = [sl + ':' for sl in list(summary_labels)]
    rel_diff_summary_mat = hdict2matrix(tdhorder, rel_diff_summary_dict)
    mycsv.writecsv(
        [[], [],
         [
             'Relative difference in Summary of %s and Summary of %s' %
             (inputfile1, inputfile2)
         ], []] + rel_diff_summary_mat, err_file, 'a')

    return diff_type, num_records, num_big, num_small
Ejemplo n.º 2
0
def info(line, logfile=None):
    """Append ``line`` as a one-cell row to ``logfile`` via mycsv.writecsv.

    Does nothing when ``logfile`` is falsy (``None`` or an empty string).
    """
    if not logfile:
        return
    single_row = [line]
    mycsv.writecsv([single_row], logfile, 'a')
Ejemplo n.º 3
0
 def test_valid_write_to_string(self):
     """Calling writecsv with only a matrix should hand back a str."""
     header_row = [u'hi', u'bye']
     data_rows = [[1, 2], [3.14159, 2.71828]]
     result = writecsv([header_row] + data_rows)
     self.assertIsInstance(result, str)
Ejemplo n.º 4
0
 def test_invalid_matrix_fails(self):
     """Passing a non-matrix value (0) to writecsv must raise BadMatrice."""
     not_a_matrix = 0
     self.assertRaises(BadMatrice, writecsv, not_a_matrix)
Ejemplo n.º 5
0
 def test_valid_write_to_file(self):
     """writecsv should write a valid matrix to a file path without error."""
     import os  # local import: only needed for temp-file cleanup here

     this_matrix = [['hi', 'bye'], [1, 2], [3.14159, 2.71828]]
     # mkstemp returns an already-open OS-level descriptor along with the
     # path; close it right away so it is not leaked, and so the path can be
     # reopened for writing.
     fd, csv_file = tempfile.mkstemp(suffix='.csv')
     os.close(fd)
     try:
         writecsv(
             this_matrix,
             csv_file)  # should just successfully write, no need to re-read it
     finally:
         # mkstemp never deletes the file itself; clean up after the test.
         os.remove(csv_file)