def evaluate_link(class_match_set, class_nonmatch_set, true_match_set, all_comparisons):
    """Print accuracy, precision, recall and F-measure for a linkage run.

    The four metrics are derived from the confusion matrix built out of the
    classified match / non-match sets against the ground-truth match set.
    """
    conf_mat = evaluation.confusion_matrix(class_match_set,
                                           class_nonmatch_set,
                                           true_match_set,
                                           all_comparisons)
    print('Linkage evaluation:')
    print(' Accuracy: %.6f' % evaluation.accuracy(conf_mat))
    print(' Precision: %.6f' % evaluation.precision(conf_mat))
    print(' Recall: %.6f' % evaluation.recall(conf_mat))
    print(' F-measure: %.6f' % evaluation.fmeasure(conf_mat))
    print('')
# Aggregate per-image confusion-matrix counts from the totals file and append
# the derived metrics to 'circle_found_results.txt'.
# NOTE(review): only fn is reset here; tp, fp, tn and lines are assumed to be
# initialised by the preceding (unseen) code, and 'file' to name the input
# totals file -- confirm against the earlier part of this script.
fn = 0

# Each line holds: image name, tp, fp, tn, fn (totals[0] is the image name).
with open(file, 'r') as f:
    for line in f:
        lines += 1
        totals = line.split()
        tp += int(totals[1])
        fp += int(totals[2])
        tn += int(totals[3])
        fn += int(totals[4])

precision = evaluation.precision(tp, fp)
sensitivity = evaluation.sensitivity(tp, fn)
fmeasure = evaluation.fmeasure(tp, fp, fn)
dicecoeff = evaluation.dicecoeff(tp, fp, fn)
jaccardindex = evaluation.jaccardindex(tp, fp, fn)

# 'with' fixes the original leak: the results handle was never closed.
# ('Coefficent' typo kept so the output format stays byte-identical.)
with open('circle_found_results.txt', 'a') as r:
    r.write('File name: ' + file + '\n')
    r.write('Lines: ' + str(lines) + '\n')
    r.write('True Positives: ' + str(tp) + '\n')
    r.write('False Positives: ' + str(fp) + '\n')
    r.write('True Negatives: ' + str(tn) + '\n')
    r.write('False Negatives: ' + str(fn) + '\n')
    r.write('Precision: ' + str(precision) + '\n')
    r.write('Sensitivity: ' + str(sensitivity) + '\n')
    r.write('F-Measure: ' + str(fmeasure) + '\n')
    r.write('Dice Coefficent: ' + str(dicecoeff) + '\n')
    r.write('Jaccard Index: ' + str(jaccardindex) + '\n' + '\n')
# Variant of the aggregation above for totals files WITHOUT a leading image
# name column: counts start at totals[0].
# NOTE(review): only tn and fn are reset here; tp, fp and lines are assumed to
# be initialised by the preceding (unseen) code -- confirm.
tn = 0
fn = 0

# Each line holds: tp, fp, tn, fn.
with open(file, 'r') as f:
    for line in f:
        lines += 1
        totals = line.split()
        tp += int(totals[0])
        fp += int(totals[1])
        tn += int(totals[2])
        fn += int(totals[3])

precision = evaluation.precision(tp, fp)
sensitivity = evaluation.sensitivity(tp, fn)
fmeasure = evaluation.fmeasure(tp, fp, fn)
dicecoeff = evaluation.dicecoeff(tp, fp, fn)
jaccardindex = evaluation.jaccardindex(tp, fp, fn)

# 'with' fixes the original leak: the results handle was never closed.
# ('Coefficent' typo kept so the output format stays byte-identical.)
with open('circle_found_results.txt', 'a') as r:
    r.write('File name: ' + file + '\n')
    r.write('Lines: ' + str(lines) + '\n')
    r.write('True Positives: ' + str(tp) + '\n')
    r.write('False Positives: ' + str(fp) + '\n')
    r.write('True Negatives: ' + str(tn) + '\n')
    r.write('False Negatives: ' + str(fn) + '\n')
    r.write('Precision: ' + str(precision) + '\n')
    r.write('Sensitivity: ' + str(sensitivity) + '\n')
    r.write('F-Measure: ' + str(fmeasure) + '\n')
    r.write('Dice Coefficent: ' + str(dicecoeff) + '\n')
    r.write('Jaccard Index: ' + str(jaccardindex) + '\n' + '\n')
# Report blocking quality for this run.
print('Blocking evaluation:')
print(' Reduction ratio: %.3f' % (rr))
print(' Pairs completeness: %.3f' % (pc))
print(' Pairs quality: %.3f' % (pq))
print('')

# Linkage evaluation: build the confusion matrix once and report each metric.
conf_mat = evaluation.confusion_matrix(class_match_set, class_nonmatch_set,
                                       true_match_set, all_comparisons)
print('Linkage evaluation:')
print(' Accuracy: %.3f' % (evaluation.accuracy(conf_mat)))
print(' Precision: %.3f' % (evaluation.precision(conf_mat)))
print(' Recall: %.3f' % (evaluation.recall(conf_mat)))
print(' F-measure: %.3f' % (evaluation.fmeasure(conf_mat)))
print('')

# Total wall-clock time across all four linkage phases.
linkage_time = (loading_time + blocking_time +
                comparison_time + classification_time)
print('Total runtime required for linkage: %.3f sec' % (linkage_time))

# -----------------------------------------------------------------------------
# End of program.
def main(blocking_fn, classification_fn, threshold, minthresh, weightvec,
         blocking_attrs, func_list, save=False):
    """Run one full record-linkage experiment and return its metrics.

    Args:
      blocking_fn: blocking technique ('none', 'attr', 'soundex' or 'slk').
      classification_fn: classifier ('exact', 'simthresh', 'minsim',
        'weightsim' or 'dt').
      threshold: similarity threshold for threshold-based classifiers.
      minthresh: minimum per-attribute similarity for 'minsim'.
      weightvec: per-attribute weights for 'weightsim'.
      blocking_attrs: attribute indices used to build blocking keys
        (must occur in the module-level attribute lists).
      func_list: six comparison functions, one per compared attribute.
      save: when True, write the matched pairs to 'final_results.txt'.

    Returns:
      Dictionary mapping parameter/metric names to their values for this run.
    """
    # The same blocking attributes are applied to both datasets.
    blocking_attrA_list = blocking_attrs
    blocking_attrB_list = blocking_attrs

    # (comparison function, attr index in record A, attr index in record B)
    # triples.  The exact-match set is unused here; kept as a lab reference.
    exact_comp_funct_list = [
        (comparison.exact_comp, 1, 1),    # First name
        (comparison.exact_comp, 2, 2),    # Middle name
        (comparison.exact_comp, 3, 3),    # Last name
        (comparison.exact_comp, 8, 8),    # Suburb
        (comparison.exact_comp, 10, 10),  # State
    ]

    approx_comp_funct_list = [
        (func_list[0], 1, 1),    # First name
        (func_list[1], 2, 2),    # Middle name
        (func_list[2], 3, 3),    # Last name
        (func_list[3], 7, 7),    # Address
        (func_list[4], 8, 8),    # Suburb
        (func_list[5], 10, 10),  # State
    ]

    # Step 1: Load the two datasets and the ground-truth matching pairs.
    start_time = time.time()
    recA_dict = loadDataset.load_data_set(datasetA_name, rec_idA_col,
                                          attrA_list, headerA_line)
    recB_dict = loadDataset.load_data_set(datasetB_name, rec_idB_col,
                                          attrB_list, headerB_line)
    true_match_set = loadDataset.load_truth_data(truthfile_name)
    loading_time = time.time() - start_time

    # Step 2: Block the datasets.
    def genericBlock(block_function='none', recA_dict=recA_dict,
                     recB_dict=recB_dict,
                     blocking_attrA_list=blocking_attrA_list,
                     blocking_attrB_list=blocking_attrB_list):
        # Apply the selected blocking technique to both datasets and return
        # (blocksA, blocksB, elapsed seconds).
        start_time = time.time()

        if block_function == 'none':  # No blocking (all records in one block)
            resultA = blocking.noBlocking(recA_dict)
            resultB = blocking.noBlocking(recB_dict)
        if block_function == 'attr':  # Simple attribute-based blocking
            resultA = blocking.simpleBlocking(recA_dict, blocking_attrA_list)
            resultB = blocking.simpleBlocking(recB_dict, blocking_attrB_list)
        if block_function == 'soundex':  # Phonetic (Soundex) based blocking
            resultA = blocking.phoneticBlocking(recA_dict, blocking_attrA_list)
            resultB = blocking.phoneticBlocking(recB_dict, blocking_attrB_list)
        if block_function == 'slk':  # Statistical linkage key (SLK-581)
            fam_name_attr_ind = 3
            giv_name_attr_ind = 1
            dob_attr_ind = 6
            gender_attr_ind = 4
            resultA = blocking.slkBlocking(recA_dict, fam_name_attr_ind,
                                           giv_name_attr_ind, dob_attr_ind,
                                           gender_attr_ind)
            resultB = blocking.slkBlocking(recB_dict, fam_name_attr_ind,
                                           giv_name_attr_ind, dob_attr_ind,
                                           gender_attr_ind)
        # NOTE(review): an unrecognised block_function leaves resultA/resultB
        # unbound and raises UnboundLocalError below -- confirm callers only
        # pass the four supported names.

        block_time = time.time() - start_time
        return resultA, resultB, block_time

    blockA_dict, blockB_dict, blocking_time = genericBlock(
        block_function=blocking_fn)

    # Step 3: Compare the candidate record pairs within each block.
    start_time = time.time()
    sim_vec_dict = comparison.compareBlocks(blockA_dict, blockB_dict,
                                            recA_dict, recB_dict,
                                            approx_comp_funct_list)
    comparison_time = time.time() - start_time

    # Step 4: Classify the candidate pairs.
    def genericClassification(classification_function='exact',
                              sim_vec_dict=sim_vec_dict,
                              sim_threshold=threshold,
                              min_sim_threshold=minthresh,
                              weight_vec=weightvec,
                              true_match_set=true_match_set):
        # Apply the selected classifier and return
        # (match set, non-match set, elapsed seconds).
        start_time = time.time()

        if classification_function == 'exact':
            # Exact matching based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.exactClassify(sim_vec_dict)
        if classification_function == 'simthresh':
            # Similarity threshold based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.thresholdClassify(sim_vec_dict, sim_threshold)
        if classification_function == 'minsim':
            # Minimum similarity threshold based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.minThresholdClassify(sim_vec_dict,
                                                    min_sim_threshold)
        if classification_function == 'weightsim':
            # Weighted similarity threshold based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.weightedSimilarityClassify(sim_vec_dict,
                                                          weight_vec,
                                                          sim_threshold)
        if classification_function == 'dt':
            # A supervised decision tree classifier
            class_match_set1, class_nonmatch_set1 = \
                classification.supervisedMLClassify(sim_vec_dict,
                                                    true_match_set)
        # NOTE(review): an unrecognised classification_function leaves the
        # result sets unbound -- confirm callers only pass supported names.

        class_time = time.time() - start_time
        return class_match_set1, class_nonmatch_set1, class_time

    # NOTE(review): this rebinds 'threshold' AFTER genericClassification's
    # defaults were evaluated, so classification still uses the original
    # threshold while results['threshold'] below reports minthresh -- confirm
    # this is intended.
    threshold = minthresh

    class_match_set, class_nonmatch_set, classification_time = \
        genericClassification(classification_fn)

    # Step 5: Evaluate the blocking and the classification.
    results = {}  # renamed from 'dict' to avoid shadowing the builtin

    # Number of record pairs actually compared, vs. the full cross product.
    num_comparisons = len(sim_vec_dict)
    all_comparisons = len(recA_dict) * len(recB_dict)

    # Identifiers of the compared record pairs.
    cand_rec_id_pair_list = sim_vec_dict.keys()

    # Blocking evaluation.
    rr = evaluation.reduction_ratio(num_comparisons, all_comparisons)
    pc = evaluation.pairs_completeness(cand_rec_id_pair_list, true_match_set)
    pq = evaluation.pairs_quality(cand_rec_id_pair_list, true_match_set)

    # Linkage evaluation.
    linkage_result = evaluation.confusion_matrix(class_match_set,
                                                 class_nonmatch_set,
                                                 true_match_set,
                                                 all_comparisons)
    accuracy = evaluation.accuracy(linkage_result)
    precision = evaluation.precision(linkage_result)
    recall = evaluation.recall(linkage_result)
    fmeasure = evaluation.fmeasure(linkage_result)

    linkage_time = loading_time + blocking_time + comparison_time + \
        classification_time

    # Export run parameters and metrics.
    results['blocking_fn'] = blocking_fn
    results['classification_fn'] = classification_fn
    results['threshold'] = threshold
    results['min_thresh'] = minthresh
    results['weight_vec'] = weightvec
    results['blocking_attrs'] = blocking_attrs
    results['comp_funcs'] = func_list
    results['num_comparisons'] = num_comparisons
    results['all_comparisons'] = all_comparisons
    results['rr'] = rr
    results['pc'] = pc
    results['pq'] = pq
    results['blocking_time'] = blocking_time
    results['accuracy'] = accuracy
    results['precision'] = precision
    results['recall'] = recall
    results['fmeasure'] = fmeasure
    results['linkage_time'] = linkage_time

    # Optionally persist the matched record pairs.
    if save:
        saveLinkResult.save_linkage_set('final_results.txt', class_match_set)

    return results