Example 1

import evaluation

def evaluate_link(class_match_set, class_nonmatch_set, true_match_set,
                  all_comparisons):
    # Linkage evaluation
    linkage_result = evaluation.confusion_matrix(class_match_set,
                                                 class_nonmatch_set,
                                                 true_match_set,
                                                 all_comparisons)

    accuracy = evaluation.accuracy(linkage_result)
    precision = evaluation.precision(linkage_result)
    recall = evaluation.recall(linkage_result)
    fmeasure = evaluation.fmeasure(linkage_result)

    print('Linkage evaluation:')
    print('  Accuracy:    %.6f' % (accuracy))
    print('  Precision:   %.6f' % (precision))
    print('  Recall:      %.6f' % (recall))
    print('  F-measure:   %.6f' % (fmeasure))
    print('')
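A minimal usage sketch for evaluate_link, assuming evaluation is the project's own helper module and that the classified and truth sets hold record-ID pairs; the pairs and counts below are purely hypothetical.

# Hypothetical record-ID pairs produced by a classifier
class_match_set = {('a1', 'b1'), ('a2', 'b2')}
class_nonmatch_set = {('a3', 'b7')}

# Hypothetical ground-truth matches and total comparison space
true_match_set = {('a1', 'b1'), ('a2', 'b9')}
all_comparisons = 100 * 100  # |dataset A| * |dataset B|

evaluate_link(class_match_set, class_nonmatch_set, true_match_set,
              all_comparisons)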
Example 2
    lines = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    f = open(file, 'r')
    for line in f:
        lines += 1
        totals = line.split()
        # totals[0] is image name
        tp += int(totals[1])
        fp += int(totals[2])
        tn += int(totals[3])
        fn += int(totals[4])
    f.close()

    precision = evaluation.precision(tp, fp)
    sensitivity = evaluation.sensitivity(tp, fn)
    fmeasure = evaluation.fmeasure(tp, fp, fn)
    dicecoeff = evaluation.dicecoeff(tp, fp, fn)
    jaccardindex = evaluation.jaccardindex(tp, fp, fn)

    r = open('circle_found_results.txt', 'a')
    r.write('File name: ' + file + '\n')
    r.write('Lines: ' + str(lines) + '\n')
    r.write('True Positives: ' + str(tp) + '\n')
    r.write('False Positives: ' + str(fp) + '\n')
    r.write('True Negatives: ' + str(tn) + '\n')
    r.write('False Negatives: ' + str(fn) + '\n')
    r.write('Precision: ' + str(precision) + '\n')
    r.write('Sensitivity: ' + str(sensitivity) + '\n')
    r.write('F-Measure: ' + str(fmeasure) + '\n')
    r.write('Dice Coefficient: ' + str(dicecoeff) + '\n')
    r.write('Jaccard Index: ' + str(jaccardindex) + '\n' + '\n')
    r.close()
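The counts collected above feed the usual set-overlap metrics: precision = tp / (tp + fp), sensitivity (recall) = tp / (tp + fn), F-measure = Dice = 2*tp / (2*tp + fp + fn), and Jaccard = tp / (tp + fp + fn). A short sketch of equivalent helpers, assuming that is what the project's evaluation module computes:

def precision(tp, fp):
    # Fraction of predicted positives that are correct
    return tp / (tp + fp) if (tp + fp) else 0.0

def sensitivity(tp, fn):
    # Fraction of actual positives that were found (recall)
    return tp / (tp + fn) if (tp + fn) else 0.0

def fmeasure(tp, fp, fn):
    # Harmonic mean of precision and recall
    return 2.0 * tp / (2.0 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0

def dicecoeff(tp, fp, fn):
    # Dice coefficient; numerically identical to the F-measure above
    return fmeasure(tp, fp, fn)

def jaccardindex(tp, fp, fn):
    # Overlap between the predicted and true positive sets
    return tp / (tp + fp + fn) if (tp + fp + fn) else 0.0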
Example 3
    lines = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0

    f = open(file, 'r')
    for line in f:
        lines += 1
        totals = line.split()
        tp += int(totals[0])
        fp += int(totals[1])
        tn += int(totals[2])
        fn += int(totals[3])
    f.close()

    precision = evaluation.precision(tp, fp)
    sensitivity = evaluation.sensitivity(tp, fn)
    fmeasure = evaluation.fmeasure(tp, fp, fn)
    dicecoeff = evaluation.dicecoeff(tp, fp, fn)
    jaccardindex = evaluation.jaccardindex(tp, fp, fn)

    r = open('circle_found_results.txt', 'a')
    r.write('File name: ' + file + '\n')
    r.write('Lines: ' + str(lines) + '\n')
    r.write('True Positives: ' + str(tp) + '\n')
    r.write('False Positives: ' + str(fp) + '\n')
    r.write('True Negatives: ' + str(tn) + '\n')
    r.write('False Negatives: ' + str(fn) + '\n')
    r.write('Precision: ' + str(precision) + '\n')
    r.write('Sensitivity: ' + str(sensitivity) + '\n')
    r.write('F-Measure: ' + str(fmeasure) + '\n')
    r.write('Dice Coefficient: ' + str(dicecoeff) + '\n')
    r.write('Jaccard Index: ' + str(jaccardindex) + '\n' + '\n')
    r.close()
Example 4
print('Blocking evaluation:')
print('  Reduction ratio:    %.3f' % (rr))
print('  Pairs completeness: %.3f' % (pc))
print('  Pairs quality:      %.3f' % (pq))
print('')

# Linkage evaluation
#
linkage_result = evaluation.confusion_matrix(class_match_set,
                                             class_nonmatch_set,
                                             true_match_set, all_comparisons)

accuracy = evaluation.accuracy(linkage_result)
precision = evaluation.precision(linkage_result)
recall = evaluation.recall(linkage_result)
fmeasure = evaluation.fmeasure(linkage_result)

print('Linkage evaluation:')
print('  Accuracy:    %.3f' % (accuracy))
print('  Precision:   %.3f' % (precision))
print('  Recall:      %.3f' % (recall))
print('  F-measure:   %.3f' % (fmeasure))
print('')

linkage_time = loading_time + blocking_time + comparison_time + \
               classification_time
print('Total runtime required for linkage: %.3f sec' % (linkage_time))

# -----------------------------------------------------------------------------

# End of program.
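For reference, the blocking metrics printed above are conventionally defined as: reduction ratio = 1 - (candidate comparisons / all possible comparisons), pairs completeness = true matches among the candidates / all true matches, and pairs quality = true matches among the candidates / candidate comparisons. A sketch under the assumption that the evaluation module follows these definitions:

def reduction_ratio(num_comparisons, all_comparisons):
    # How much of the full comparison space blocking removed
    return (1.0 - num_comparisons / all_comparisons) if all_comparisons else 0.0

def pairs_completeness(cand_rec_id_pair_list, true_match_set):
    # Share of true matches that survive blocking
    found = sum(1 for pair in cand_rec_id_pair_list if pair in true_match_set)
    return found / len(true_match_set) if true_match_set else 0.0

def pairs_quality(cand_rec_id_pair_list, true_match_set):
    # Share of candidate pairs that are true matches
    cand_pairs = list(cand_rec_id_pair_list)
    found = sum(1 for pair in cand_pairs if pair in true_match_set)
    return found / len(cand_pairs) if cand_pairs else 0.0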
Example 5

import time

# Project helper modules used below (the dataset names, column indices and
# attribute lists such as datasetA_name or attrA_list are defined elsewhere
# in the original script)
import blocking
import classification
import comparison
import evaluation
import loadDataset
import saveLinkResult

def main(blocking_fn,
         classification_fn,
         threshold,
         minthresh,
         weightvec,
         blocking_attrs,
         func_list,
         save=False):

    # ******** In lab 3, explore different attribute sets for blocking ************

    # The list of attributes to use for blocking (all must occur in the above
    # attribute lists)
    blocking_attrA_list = blocking_attrs
    blocking_attrB_list = blocking_attrs

    # ******** In lab 4, explore different comparison functions for different  ****
    # ********           attributes                                            ****

    # The list of tuples (comparison function, attribute number in record A,
    # attribute number in record B)
    #
    exact_comp_funct_list = [
        (comparison.exact_comp, 1, 1),  # First name
        (comparison.exact_comp, 2, 2),  # Middle name
        (comparison.exact_comp, 3, 3),  # Last name
        (comparison.exact_comp, 8, 8),  # Suburb
        (comparison.exact_comp, 10, 10),  # State
    ]

    approx_comp_funct_list = [
        (func_list[0], 1, 1),  # First name
        (func_list[1], 2, 2),  # Middle name
        (func_list[2], 3, 3),  # Last name
        (func_list[3], 7, 7),  # Address
        (func_list[4], 8, 8),  # Suburb
        (func_list[5], 10, 10),  # State
    ]

    # =============================================================================
    #
    # Step 1: Load the two datasets from CSV files

    start_time = time.time()

    recA_dict = loadDataset.load_data_set(datasetA_name, rec_idA_col, \
                                          attrA_list, headerA_line)
    recB_dict = loadDataset.load_data_set(datasetB_name, rec_idB_col, \
                                          attrB_list, headerB_line)

    # Load data set of true matching pairs
    #
    true_match_set = loadDataset.load_truth_data(truthfile_name)

    loading_time = time.time() - start_time

    # -----------------------------------------------------------------------------
    # Step 2: Block the datasets

    def genericBlock(block_function='none',
                     recA_dict=recA_dict,
                     recB_dict=recB_dict,
                     blocking_attrA_list=blocking_attrA_list,
                     blocking_attrB_list=blocking_attrB_list):

        start_time = time.time()

        # Select one blocking technique
        if block_function == 'none':
            # No blocking (all records in one block)
            #
            resultA = blocking.noBlocking(recA_dict)
            resultB = blocking.noBlocking(recB_dict)

        elif block_function == 'attr':
            # Simple attribute-based blocking
            #
            resultA = blocking.simpleBlocking(recA_dict, blocking_attrA_list)
            resultB = blocking.simpleBlocking(recB_dict, blocking_attrB_list)

        elif block_function == 'soundex':
            # Phonetic (Soundex) based blocking
            #
            resultA = blocking.phoneticBlocking(recA_dict, blocking_attrA_list)
            resultB = blocking.phoneticBlocking(recB_dict, blocking_attrB_list)

        elif block_function == 'slk':
            # Statistical linkage key (SLK-581) based blocking
            #
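            # (SLK-581 combines selected letters of the family and given
            # names with the date of birth and sex into a single linkage key)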
            fam_name_attr_ind = 3
            giv_name_attr_ind = 1
            dob_attr_ind = 6
            gender_attr_ind = 4

            resultA = blocking.slkBlocking(recA_dict, fam_name_attr_ind, \
                                              giv_name_attr_ind, dob_attr_ind, \
                                              gender_attr_ind)
            resultB = blocking.slkBlocking(recB_dict, fam_name_attr_ind, \
                                              giv_name_attr_ind, dob_attr_ind, \
                                              gender_attr_ind)

        block_time = time.time() - start_time

        # Print blocking statistics
        #
        # blocking.printBlockStatistics(resultA, resultB)

        return resultA, resultB, block_time

    blockA_dict, blockB_dict, blocking_time = genericBlock(
        block_function=blocking_fn)
    # -----------------------------------------------------------------------------
    # Step 3: Compare the candidate pairs

    start_time = time.time()

    sim_vec_dict = comparison.compareBlocks(blockA_dict, blockB_dict, \
                                            recA_dict, recB_dict, \
                                            approx_comp_funct_list)

    comparison_time = time.time() - start_time

    # -----------------------------------------------------------------------------
    # Step 4: Classify the candidate pairs

    def genericClassification(classification_function='exact',
                              sim_vec_dict=sim_vec_dict,
                              sim_threshold=threshold,
                              min_sim_threshold=minthresh,
                              weight_vec=weightvec,
                              true_match_set=true_match_set):
        start_time = time.time()

        if classification_function == 'exact':
            # Exact matching based classification
            class_match_set1, class_nonmatch_set1 = \
                         classification.exactClassify(sim_vec_dict)

        elif classification_function == 'simthresh':
            # Similarity threshold based classification
            #
            class_match_set1, class_nonmatch_set1 = \
                        classification.thresholdClassify(sim_vec_dict, sim_threshold)

        elif classification_function == 'minsim':
            # Minimum similarity threshold based classification
            #
            class_match_set1, class_nonmatch_set1 = \
                        classification.minThresholdClassify(sim_vec_dict,
                                                            min_sim_threshold)

        elif classification_function == 'weightsim':
            # Weighted similarity threshold based classification
            #
            # weight_vec = [1.0] * len(approx_comp_funct_list)

            # Lower weights for middle name and state
            #
            # weight_vec = [2.0, 1.0, 2.0, 2.0, 2.0, 1.0]

            class_match_set1, class_nonmatch_set1 = \
                        classification.weightedSimilarityClassify(sim_vec_dict,
                                                                  weight_vec,
                                                                  sim_threshold)

        elif classification_function == 'dt':
            # A supervised decision tree classifier
            #
            class_match_set1, class_nonmatch_set1 = \
                      classification.supervisedMLClassify(sim_vec_dict, true_match_set)

        class_time = time.time() - start_time

        return class_match_set1, class_nonmatch_set1, class_time


    class_match_set, class_nonmatch_set, classification_time = genericClassification(
        classification_fn)

    # -----------------------------------------------------------------------------
    # Step 5: Evaluate the classification

    # Initialise dictionary of results
    result_dict = {}

    # Get the number of record pairs compared
    #
    num_comparisons = len(sim_vec_dict)

    # Get the total number of record pairs that would be compared if no blocking was used
    #
    all_comparisons = len(recA_dict) * len(recB_dict)

    # Get the list of identifiers of the compared record pairs
    #
    cand_rec_id_pair_list = sim_vec_dict.keys()

    # Blocking evaluation
    #
    rr = evaluation.reduction_ratio(num_comparisons, all_comparisons)
    pc = evaluation.pairs_completeness(cand_rec_id_pair_list, true_match_set)
    pq = evaluation.pairs_quality(cand_rec_id_pair_list, true_match_set)

    # Linkage evaluation
    #
    linkage_result = evaluation.confusion_matrix(class_match_set,
                                                 class_nonmatch_set,
                                                 true_match_set,
                                                 all_comparisons)

    accuracy = evaluation.accuracy(linkage_result)
    precision = evaluation.precision(linkage_result)
    recall = evaluation.recall(linkage_result)
    fmeasure = evaluation.fmeasure(linkage_result)

    # print('Linkage evaluation:')
    # print('  Accuracy:    %.3f' % (accuracy))
    # print('  Precision:   %.3f' % (precision))
    # print('  Recall:      %.3f' % (recall))
    # print('  F-measure:   %.3f' % (fmeasure))
    # print('')

    linkage_time = loading_time + blocking_time + comparison_time + \
                   classification_time
    # print('Total runtime required for linkage: %.3f sec' % (linkage_time))

    # Collect configuration, blocking and linkage metrics for this run
    result_dict['blocking_fn'] = blocking_fn
    result_dict['classification_fn'] = classification_fn
    result_dict['threshold'] = threshold
    result_dict['min_thresh'] = minthresh
    result_dict['weight_vec'] = weightvec
    result_dict['blocking_attrs'] = blocking_attrs
    result_dict['comp_funcs'] = func_list
    result_dict['num_comparisons'] = num_comparisons
    result_dict['all_comparisons'] = all_comparisons
    # result_dict['cand_rec_id_pair_list'] = cand_rec_id_pair_list
    result_dict['rr'] = rr
    result_dict['pc'] = pc
    result_dict['pq'] = pq
    result_dict['blocking_time'] = blocking_time
    # result_dict['linkage_result'] = linkage_result
    result_dict['accuracy'] = accuracy
    result_dict['precision'] = precision
    result_dict['recall'] = recall
    result_dict['fmeasure'] = fmeasure
    result_dict['linkage_time'] = linkage_time

    # Save results
    if save:
        saveLinkResult.save_linkage_set('final_results.txt', class_match_set)

    # Return results
    return result_dict
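
A hypothetical driver call for main(), reusing only names that already appear in the listing (comparison.exact_comp and the attribute numbering from the comments); any of the project's other comparison functions could be substituted into func_list:

if __name__ == '__main__':
    # blocking_fn: 'none', 'attr', 'soundex' or 'slk'
    # classification_fn: 'exact', 'simthresh', 'minsim', 'weightsim' or 'dt'
    results = main(blocking_fn='soundex',
                   classification_fn='simthresh',
                   threshold=0.75,
                   minthresh=0.5,
                   weightvec=[2.0, 1.0, 2.0, 2.0, 2.0, 1.0],
                   blocking_attrs=[1, 3],  # first name and last name
                   func_list=[comparison.exact_comp] * 6,
                   save=False)
    print('F-measure: %.3f' % results['fmeasure'])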