def crossValidation(initialTrainingDataSet, allConditions, k):
    # Split the training data into k folds, then train and evaluate once per fold.
    listOfFolds = makelistOfFolds(initialTrainingDataSet, k)
    kAccuracies = []
    for i in range(k):
        # Fold i is held out for testing; the remaining folds form the training set.
        trainingData, testingData = getIthTrainingAndTestingData(listOfFolds, i)
        adtClassifier = adt.classifier(trainingData, allConditions)
        testingDataOutput = adt.evaluate(testingData, adtClassifier)
        kAccuracies.append(getAccuracy(testingDataOutput, initialTrainingDataSet))
    # Combine the per-fold accuracies into a single cross-validation score.
    return computeFinalAccuracy(kAccuracies)
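# NOTE (sketch): the fold helpers referenced above are not shown in this section. The
# following is a minimal illustration of how makelistOfFolds, getIthTrainingAndTestingData
# and computeFinalAccuracy could be implemented; the names match the calls above, but the
# bodies are assumptions, not the project's actual implementation.
def makelistOfFolds(dataSet, k):
    # Deal the record-pair keys round-robin into k roughly equal folds.
    folds = [{} for _ in range(k)]
    for index, pairKey in enumerate(sorted(dataSet.keys())):
        folds[index % k][pairKey] = dataSet[pairKey]
    return folds

def getIthTrainingAndTestingData(listOfFolds, i):
    # Fold i is the held-out test set; all other folds are merged into the training set.
    testingData = listOfFolds[i]
    trainingData = {}
    for j, fold in enumerate(listOfFolds):
        if j != i:
            trainingData.update(fold)
    return trainingData, testingData

def computeFinalAccuracy(kAccuracies):
    # Cross-validation accuracy is the mean of the per-fold accuracies.
    return sum(kAccuracies) / float(len(kAccuracies))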
def main():
    if len(sys.argv) != 3:
        Usage()
        return

    testing_data_file = sys.argv[1]
    adtree_file = sys.argv[2]

    if os.path.exists(testing_data_file) and os.path.exists(adtree_file):
        adtree = adt_infrastructure.ReCreateADTree(adtree_file, adt)
        (records, keys) = adt_infrastructure.GetTestingData(testing_data_file)

        testingDataSet = {}

        #        n!
        # ------------- = number of unique combinations
        # (n - r)! * r!
        num_records = len(records)
        print " ***** Number of records     : %s" % num_records
        print "       Computing number of combinations..."
        num_combinations = math.factorial(num_records) / (math.factorial(num_records - 2) * math.factorial(2))
        print " ***** Number of combinations: %s" % num_combinations

        # test unique combinations for record linkage (duplicates)
        combination_pairs = []
        total_processed = 0
        number_of_matches = 0
        number_of_nonmatches = 0
        for combination_1 in range(num_records):
            total_processed = total_processed + (num_records - combination_1) - 1
            percent_complete = float(total_processed) / float(num_combinations) * 100
            print " ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed)
            for combination_2 in range(combination_1 + 1, num_records):
                tempDictionary = {}
                for key in keys[1:]:
                    levenshtein_distance = levenshtein.Compute_Levenshtein_Distance(records[combination_1][key], records[combination_2][key])
                    tempDictionary[key] = levenshtein_distance
                combination_pair = str(combination_1) + '-' + str(combination_2)
                combination_pairs.append(combination_pair)
                testingDataSet[combination_pair] = tempDictionary

        outputDatabase = adt.evaluate(testingDataSet, adtree)

        for combination_pair in outputDatabase.keys():
            print combination_pair
    else:
        Usage()
        return
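# NOTE (sketch): for r = 2 the factorial formula above reduces to n*(n-1)/2. The helper
# below is an assumed, drop-in equivalent (not part of the original code) that avoids
# computing two very large factorials when the record set is big:
def count_unique_pairs(num_records):
    # C(n, 2) = n! / ((n - 2)! * 2!) = n * (n - 1) / 2
    return num_records * (num_records - 1) // 2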
def main():
    global trainingDataSet

    if len(sys.argv) != 3:
        Usage()
        return

    testing_data_file = sys.argv[1]
    adtree_file = sys.argv[2]

    if os.path.exists(testing_data_file) and os.path.exists(adtree_file):
        adtree = ReCreateADTree(adtree_file)
        (records, keys) = adt_infrastructure.BuildDataStructure(testing_data_file)

        trainingDataSet = {}

        #        n!
        # ------------- = number of unique combinations
        # (n - r)! * r!
        num_records = len(records)
        print " ***** Number of records     : %s" % num_records
        print "       Computing number of combinations..."
        num_combinations = math.factorial(num_records) / (math.factorial(num_records - 2) * math.factorial(2))
        print " ***** Number of combinations: %s" % num_combinations

        # test unique combinations for record linkage (duplicates)
        combination_pairs = []
        total_processed = 0
        for combination_1 in range(num_records):
            total_processed = total_processed + (num_records - combination_1) - 1
            percent_complete = float(total_processed) / float(num_combinations) * 100
            print " ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed)
            for combination_2 in range(combination_1 + 1, num_records):
                tempDictionary = {}
                # Records with identical IDs refer to the same real-world entity.
                if levenshtein.Compute_Levenshtein_Distance(records[combination_1]["ID"], records[combination_2]["ID"]) == 0:
                    tempDictionary['class'] = 'same'
                else:
                    tempDictionary['class'] = 'different'
                for key in keys[1:]:
                    levenshtein_distance = levenshtein.Compute_Levenshtein_Distance(records[combination_1][key], records[combination_2][key])
                    tempDictionary[key] = levenshtein_distance
                combination_pair = str(combination_1) + '-' + str(combination_2)
                combination_pairs.append(combination_pair)
                trainingDataSet[combination_pair] = tempDictionary

        outputDatabase = adt.evaluate(trainingDataSet, adtree)

        correct_classifications = 0
        incorrect_classifications = 0
        false_positives = 0
        false_negatives = 0
        for combination_pair in outputDatabase.keys():
            pair = combination_pair.split("-")
            if records[int(pair[0])]["ID"] == records[int(pair[1])]["ID"] and outputDatabase[combination_pair]["classification"] == "same":
                correct_classifications = correct_classifications + 1
            elif records[int(pair[0])]["ID"] != records[int(pair[1])]["ID"] and outputDatabase[combination_pair]["classification"] == "different":
                correct_classifications = correct_classifications + 1
            else:
                if outputDatabase[combination_pair]["classification"] == "same":
                    print "False Positive:"
                    false_positives = false_positives + 1
                elif outputDatabase[combination_pair]["classification"] == "different":
                    print "False Negative:"
                    false_negatives = false_negatives + 1
                print "     record[%s][ID] = %s --- address: %s" % (pair[0], records[int(pair[0])]["ID"], records[int(pair[0])]["FullAddres"])
                print "     record[%s][ID] = %s --- address: %s" % (pair[1], records[int(pair[1])]["ID"], records[int(pair[1])]["FullAddres"])
                print "     classification = %s" % outputDatabase[combination_pair]["classification"]
                incorrect_classifications = incorrect_classifications + 1

        print ""
        print "Number of records        : %s" % num_records
        print "Number of combinations   : %s" % num_combinations
        print ""
        total_classifications = correct_classifications + incorrect_classifications
        print "Correct classifications  : %s" % correct_classifications
        print "Incorrect classifications: %s" % incorrect_classifications
        classification_accuracy_ratio = float(correct_classifications) / float(total_classifications)
        classification_accuracy_percentage = classification_accuracy_ratio * 100.0
        print "Classification Accuracy  : %.2f%%" % classification_accuracy_percentage
        print ""
        print "False Positives: %s" % false_positives
        print "False Negatives: %s" % false_negatives
    else:
        Usage()
        return
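# NOTE (sketch): the pairwise feature construction above (per-field Levenshtein distances
# plus a 'same'/'different' label derived from the ID column) is repeated in more than one
# driver in this repo. A hypothetical helper capturing the same logic might look like the
# following; the name build_pair_features is an assumption and is not defined in the
# project, and it relies on the project's levenshtein module being imported.
def build_pair_features(record_a, record_b, keys):
    features = {}
    # Pairs whose ID fields are identical are labelled as the same real-world entity.
    if levenshtein.Compute_Levenshtein_Distance(record_a["ID"], record_b["ID"]) == 0:
        features['class'] = 'same'
    else:
        features['class'] = 'different'
    # Every remaining column contributes one Levenshtein-distance feature.
    for key in keys[1:]:
        features[key] = levenshtein.Compute_Levenshtein_Distance(record_a[key], record_b[key])
    return features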
def main():
    global trainingDataSet

    for i, arg in enumerate(sys.argv):
        print "arg: %s: %s" % (i, arg)

    argc = len(sys.argv)
    if argc != 4:
        Usage()
        return

    input_file = sys.argv[1]
    conditions_file = sys.argv[2]
    output_file = sys.argv[3]

    if os.path.exists(input_file):
        (records, keys) = adt_infrastructure.BuildDataStructure(input_file)

        trainingDataSet = {}

        #        n!
        # ------------- = number of unique combinations
        # (n - r)! * r!
        num_records = len(records)
        print " ***** Number of records     : %s" % num_records
        print "       Computing number of combinations..."
        num_combinations = math.factorial(num_records) / (math.factorial(num_records - 2) * math.factorial(2))
        print " ***** Number of combinations: %s" % num_combinations

        # test unique combinations for record linkage (duplicates)
        combination_pairs = []
        total_processed = 0
        for combination_1 in range(num_records):
            total_processed = total_processed + (num_records - combination_1) - 1
            percent_complete = float(total_processed) / float(num_combinations) * 100
            print " ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed)
            for combination_2 in range(combination_1 + 1, num_records):
                tempDictionary = {}
                if levenshtein.Compute_Levenshtein_Distance(records[combination_1]["ID"], records[combination_2]["ID"]) == 0:
                    tempDictionary['class'] = 'same'
                else:
                    tempDictionary['class'] = 'different'
                for key in keys[1:]:
                    levenshtein_distance = levenshtein.Compute_Levenshtein_Distance(records[combination_1][key], records[combination_2][key])
                    tempDictionary[key] = levenshtein_distance
                combination_pair = str(combination_1) + '-' + str(combination_2)
                combination_pairs.append(combination_pair)
                trainingDataSet[combination_pair] = tempDictionary

        #keys.remove("ID")  # need to remove the key "ID" since it has no bearing anymore (and is not a key in the trainingDataSet dictionary)
        #OutputRecordsToTabulatedFile(trainingDataSet, output_file, combination_pairs, keys)

        allConditions = PopulateConditions(conditions_file)
        print("allConditions: %s" % allConditions)

        # Produce trained tree and results
        adtClassifier = adt.classifier(trainingDataSet, allConditions, "people")
        print("adtClassifier: %s" % adtClassifier)
        adt_infrastructure.WriteTreeToFile(adtClassifier, output_file)
        outputDatabase = adt.evaluate(trainingDataSet, adtClassifier)

        #adtClassifier = adt.classifier(trainingDataSet, allConditions)
        #results = run10FoldCrossValidation(output_file, adtClassifier)
        #print("accuracy: " + str(results))

        stop_drawing = open("node_end.txt", 'w')
        stop_drawing.close()
    else:
        Usage()
        return
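# NOTE (sketch): a hypothetical invocation of this training driver; the script name
# train_adtree.py is an assumption, substitute the actual file name:
#
#     python train_adtree.py training_records.csv conditions.txt trained_adtree.txt
#
# sys.argv[1] is the raw record file, sys.argv[2] the conditions definition read by
# PopulateConditions, and sys.argv[3] the file the trained tree is written to via
# adt_infrastructure.WriteTreeToFile.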
def main():
    if len(sys.argv) != 4:
        Usage()
        return

    training_comparisons_file = sys.argv[1]
    adtree_file = sys.argv[2]
    output_file_prefix = sys.argv[3]

    if os.path.exists(training_comparisons_file) and os.path.exists(adtree_file):
        adtree = adt_infrastructure.ReCreateADTree(adtree_file, adt)
        number_of_matches, number_of_nonmatches, trainingComparisonPairs = adt_infrastructure.BuildComparisonPairsDataStructure(training_comparisons_file)

        outputDatabase = adt.evaluate(trainingComparisonPairs, adtree)

        number_of_matches = float(number_of_matches)
        number_of_nonmatches = float(number_of_nonmatches)
        correct_classifications = 0.0
        incorrect_classifications = 0.0
        number_of_predicted_matches = 0.0
        number_of_predicted_nonmatches = 0.0
        true_positives = 0.0
        true_negatives = 0.0
        false_positives = 0.0
        false_negatives = 0.0

        for combination_pair in outputDatabase.keys():
            # true positives
            if trainingComparisonPairs[combination_pair]["classification"] == SAME and outputDatabase[combination_pair]["classification"] == SAME:
                true_positives = true_positives + 1.0
                correct_classifications = correct_classifications + 1.0
                number_of_predicted_matches = number_of_predicted_matches + 1.0
            # true negatives
            elif trainingComparisonPairs[combination_pair]["classification"] == DIFFERENT and outputDatabase[combination_pair]["classification"] == DIFFERENT:
                true_negatives = true_negatives + 1.0
                correct_classifications = correct_classifications + 1.0
                number_of_predicted_nonmatches = number_of_predicted_nonmatches + 1.0
            else:
                # false positives
                if outputDatabase[combination_pair]["classification"] == SAME:
                    #print "False Positive:", outputDatabase[combination_pair]
                    false_positives = false_positives + 1.0
                    incorrect_classifications = incorrect_classifications + 1.0
                    number_of_predicted_matches = number_of_predicted_matches + 1.0
                # false negatives
                elif outputDatabase[combination_pair]["classification"] == DIFFERENT:
                    #print "False Negative:", outputDatabase[combination_pair]
                    false_negatives = false_negatives + 1.0
                    incorrect_classifications = incorrect_classifications + 1.0
                    number_of_predicted_nonmatches = number_of_predicted_nonmatches + 1.0
                else:
                    print " **************************** THIS SHOULD NEVER HAPPEN"

        # Precision = TruePositives / (TruePositives + FalsePositives)
        precision_denominator = true_positives + false_positives
        if precision_denominator == 0:
            print " ### ERROR: Precision could not be calculated because (true_positives + false_positives) = 0"
            precision_denominator = 0.001
        precision = true_positives / precision_denominator * 100.0

        # Recall = TruePositives / (TruePositives + FalseNegatives)
        recall_denominator = true_positives + false_negatives
        if recall_denominator == 0:
            print " ### ERROR: Recall could not be calculated because (true_positives + false_negatives) = 0"
            recall_denominator = 0.001
        recall = true_positives / recall_denominator * 100.0

        #                          (TruePositives + TrueNegatives)
        # Accuracy = ------------------------------------------------------------------
        #            (TruePositives + TrueNegatives + FalsePositives + FalseNegatives)
        accuracy_denominator = true_positives + true_negatives + false_positives + false_negatives
        if accuracy_denominator == 0:
            print " ### ERROR: Accuracy could not be calculated because (true_positives + true_negatives + false_positives + false_negatives) = 0"
            accuracy_denominator = 0.001
        accuracy = (true_positives + true_negatives) / accuracy_denominator * 100.0

        print ""
        print " Training Data Record-Pairs    : %s" % len(outputDatabase)
        print ""
        print " Number of matches             : %s" % str(int(number_of_matches))
        print " Number of nonmatches          : %s" % str(int(number_of_nonmatches))
        print ""
        print " Number of predicted matches   : %s" % str(int(number_of_predicted_matches))
        print " Number of predicted nonmatches: %s" % str(int(number_of_predicted_nonmatches))
        print ""
        print " True Positives : %s detected of %s (%.2f%%)" % (str(int(true_positives)), str(int(number_of_matches)), true_positives / number_of_matches * 100.0)
        print " True Negatives : %s detected of %s (%.2f%%)" % (str(int(true_negatives)), str(int(number_of_nonmatches)), true_negatives / number_of_nonmatches * 100.0)
        print " False Positives: %s" % str(int(false_positives))
        print " False Negatives: %s" % str(int(false_negatives))
        print ""
        print " Recall   : %s%%" % recall
        print " Precision: %s%%" % precision
        print " Accuracy : %s%%" % accuracy
        print ""

        with open(output_file_prefix + "-Precision.txt", 'w') as f_precision:
            f_precision.write(str(precision))
            f_precision.write("\n")
        with open(output_file_prefix + "-Recall.txt", 'w') as f_recall:
            f_recall.write(str(recall))
            f_recall.write("\n")
        with open(output_file_prefix + "-Accuracy.txt", 'w') as f_accuracy:
            f_accuracy.write(str(accuracy))
            f_accuracy.write("\n")
    else:
        Usage()
        return
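# NOTE (sketch): the metric definitions used above, collected into one helper for
# reference. (The original source had the precision and recall formulas swapped; the
# corrected forms are shown here.) This function is an illustration and is not part of
# the original code:
def compute_metrics(true_positives, true_negatives, false_positives, false_negatives):
    # Precision: of all pairs predicted as matches, how many really match.
    predicted_matches = true_positives + false_positives
    precision = true_positives / float(predicted_matches) if predicted_matches else 0.0
    # Recall: of all true matches, how many were predicted as matches.
    actual_matches = true_positives + false_negatives
    recall = true_positives / float(actual_matches) if actual_matches else 0.0
    # Accuracy: fraction of all pairs classified correctly.
    total = true_positives + true_negatives + false_positives + false_negatives
    accuracy = (true_positives + true_negatives) / float(total) if total else 0.0
    return precision * 100.0, recall * 100.0, accuracy * 100.0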