Esempio n. 1
0
def crossValidation(initialTrainingDataSet, allConditions, k):
    """Score an ADT classifier over k folds and return the combined accuracy.

    ``makelistOfFolds`` produces the list of folds. For every fold index in
    ``range(k)`` a classifier is built from that index's training split,
    evaluated on its testing split, and scored by ``getAccuracy`` against
    the complete initial data set. The collected scores are reduced to a
    single value by ``computeFinalAccuracy``, whose result is returned.
    """
    folds = makelistOfFolds(initialTrainingDataSet, k)

    def scoreFold(foldIndex):
        # Train on one split, evaluate on the other, and score the output.
        trainSplit, testSplit = getIthTrainingAndTestingData(folds, foldIndex)
        # NOTE(review): "classifer" is spelled as in the original call; other
        # callers use adt.classifier -- confirm which name the adt module exposes.
        model = adt.classifer(trainSplit, allConditions)
        predictions = adt.evaluate(testSplit, model)
        return getAccuracy(predictions, initialTrainingDataSet)

    return computeFinalAccuracy([scoreFold(foldIndex) for foldIndex in range(k)])
def main():
	"""Build pairwise features for the testing records and evaluate them with a saved ADTree.

	Expects exactly two command-line arguments: the testing data file and
	the ADTree file. If the argument count is wrong, or either path does
	not exist, ``Usage()`` is called and nothing else is done.

	For every pair of records (i, j) with i < j, a feature dictionary is
	built that maps each key except the first to the Levenshtein distance
	between the two records' values for that key. The dictionary is stored
	under the pair id "i-j". All pairs are then passed to ``adt.evaluate``
	with the recreated tree, and each pair id in the result is printed.
	"""
	if len(sys.argv) != 3:
		Usage()
		return

	testing_data_file = sys.argv[1]
	adtree_file = sys.argv[2]

	if not (os.path.exists(testing_data_file) and os.path.exists(adtree_file)):
		Usage()
		return

	adtree = adt_infrastructure.ReCreateADTree(adtree_file, adt)

	(records, keys) = adt_infrastructure.GetTestingData(testing_data_file)
	testingDataSet = {}

	num_records = len(records)
	print(" ***** Number of records     : %s" % num_records)

	print(" Computing number of combinations...")
	# Number of unordered pairs: C(n, 2) = n * (n - 1) / 2.
	# The closed form avoids math.factorial, which raises ValueError for
	# fewer than two records and builds enormous integers for large inputs.
	num_combinations = num_records * (num_records - 1) // 2
	print(" ***** Number of combinations: %s" % num_combinations)

	# test unique combinations for record linkage (duplicates)
	total_processed = 0
	for combination_1 in range(num_records):
		for combination_2 in range(combination_1 + 1, num_records):
			tempDictionary = {}

			# keys[0] is skipped -- presumably the record identifier; confirm
			# against adt_infrastructure.GetTestingData.
			for key in keys[1:]:
				tempDictionary[key] = levenshtein.Compute_Levenshtein_Distance(records[combination_1][key], records[combination_2][key])

			combination_pair = str(combination_1) + '-' + str(combination_2)
			testingDataSet[combination_pair] = tempDictionary

		# Report progress once the row's pairs have actually been computed.
		total_processed = total_processed + (num_records - combination_1) - 1
		if num_combinations:
			percent_complete = float(total_processed) / float(num_combinations) * 100
		else:
			# A single record yields no pairs; there is nothing left to do.
			percent_complete = 100.0
		print(" ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed))

	outputDatabase = adt.evaluate(testingDataSet, adtree)

	for combination_pair in outputDatabase.keys():
		print(combination_pair)
Esempio n. 3
0
def main():
	"""Evaluate a saved ADTree on labelled record pairs and print its accuracy.

	Expects exactly two command-line arguments: the data file and the
	ADTree file. If the argument count is wrong, or either path does not
	exist, ``Usage()`` is called and nothing else is done.

	For every pair of records (i, j) with i < j, a dictionary is built
	holding a 'class' label ('same' when the Levenshtein distance between
	the two "ID" values is 0, otherwise 'different') plus, for each key
	except the first, the Levenshtein distance between the two records'
	values. The dictionaries are stored in the module-level
	``trainingDataSet`` under the pair id "i-j" and evaluated with
	``adt.evaluate``.

	Each predicted classification is then compared with the ID equality of
	the pair. Misclassified pairs are printed, followed by a summary with
	the correct/incorrect counts, accuracy, false positives and false
	negatives.
	"""
	global trainingDataSet

	if len(sys.argv) != 3:
		Usage()
		return

	testing_data_file = sys.argv[1]
	adtree_file = sys.argv[2]

	if not (os.path.exists(testing_data_file) and os.path.exists(adtree_file)):
		Usage()
		return

	adtree = ReCreateADTree(adtree_file)

	(records, keys) = adt_infrastructure.BuildDataStructure(testing_data_file)
	trainingDataSet = {}

	num_records = len(records)
	print(" ***** Number of records     : %s" % num_records)

	print(" Computing number of combinations...")
	# Number of unordered pairs: C(n, 2) = n * (n - 1) / 2.
	# The closed form avoids math.factorial, which raises ValueError for
	# fewer than two records and builds enormous integers for large inputs.
	num_combinations = num_records * (num_records - 1) // 2
	print(" ***** Number of combinations: %s" % num_combinations)

	# test unique combinations for record linkage (duplicates)
	total_processed = 0
	for combination_1 in range(num_records):
		for combination_2 in range(combination_1 + 1, num_records):
			tempDictionary = {}
			if levenshtein.Compute_Levenshtein_Distance(records[combination_1]["ID"], records[combination_2]["ID"]) == 0:
				tempDictionary['class'] = 'same'
			else:
				tempDictionary['class'] = 'different'
			# keys[0] is skipped -- presumably the "ID" column; confirm
			# against adt_infrastructure.BuildDataStructure.
			for key in keys[1:]:
				tempDictionary[key] = levenshtein.Compute_Levenshtein_Distance(records[combination_1][key], records[combination_2][key])

			combination_pair = str(combination_1) + '-' + str(combination_2)
			trainingDataSet[combination_pair] = tempDictionary

		# Report progress once the row's pairs have actually been computed.
		total_processed = total_processed + (num_records - combination_1) - 1
		if num_combinations:
			percent_complete = float(total_processed) / float(num_combinations) * 100
		else:
			# A single record yields no pairs; there is nothing left to do.
			percent_complete = 100.0
		print(" ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed))

	outputDatabase = adt.evaluate(trainingDataSet, adtree)

	correct_classifications = 0
	incorrect_classifications = 0

	false_positives = 0
	false_negatives = 0
	for combination_pair in outputDatabase.keys():
		pair = combination_pair.split("-")
		first_record = records[int(pair[0])]
		second_record = records[int(pair[1])]
		classification = outputDatabase[combination_pair]["classification"]

		# Ground truth: two records describe the same entity when their IDs match.
		if first_record["ID"] == second_record["ID"]:
			expected = "same"
		else:
			expected = "different"

		if classification == expected:
			correct_classifications = correct_classifications + 1
			continue

		if classification == "same":
			print("False Positive:")
			false_positives = false_positives + 1
		elif classification == "different":
			print("False Negative:")
			false_negatives = false_negatives + 1

		# NOTE(review): the key "FullAddres" is kept exactly as in the original;
		# presumably it matches the column name in the data file -- confirm.
		print("   record[%s][ID] = %s --- address: %s" % (pair[0], first_record["ID"], first_record["FullAddres"]))
		print("   record[%s][ID] = %s --- address: %s" % (pair[1], second_record["ID"], second_record["FullAddres"]))
		print("   classification = %s" % classification)

		incorrect_classifications = incorrect_classifications + 1

	print("")
	print("Number of records        : %s" % num_records)
	print("Number of combinations   : %s" % num_combinations)

	print("")
	total_classifications = correct_classifications + incorrect_classifications
	print("Correct classifications  : %s" % correct_classifications)
	print("Incorrect classifications: %s" % incorrect_classifications)
	if total_classifications:
		classification_accuracy_ratio = float(correct_classifications) / float(total_classifications)
	else:
		# No pairs were classified (fewer than two records); report 0 instead
		# of failing with ZeroDivisionError.
		classification_accuracy_ratio = 0.0
	classification_accuracy_percentage = classification_accuracy_ratio * 100.0
	print("Classification Accuracy  : %.2f%%" % classification_accuracy_percentage)

	print("")
	print("False Positives: %s" % false_positives)
	print("False Negatives: %s" % false_negatives)
def main():
	"""Build labelled pairwise training data, train an ADTree classifier and save it.

	Echoes every command-line argument, then expects exactly three of them:
	the input data file, the conditions file and the output file. If the
	argument count is wrong, or the input file does not exist, ``Usage()``
	is called and nothing else is done.

	For every pair of records (i, j) with i < j, a dictionary is built
	holding a 'class' label ('same' when the Levenshtein distance between
	the two "ID" values is 0, otherwise 'different') plus, for each key
	except the first, the Levenshtein distance between the two records'
	values. The dictionaries are stored in the module-level
	``trainingDataSet`` under the pair id "i-j".

	The conditions are loaded with ``PopulateConditions``. A classifier is
	then trained with ``adt.classifier``, written to the output file, and
	evaluated once on the training data. Finally an empty ``node_end.txt``
	is created in the current directory.
	"""
	global trainingDataSet
	for i, arg in enumerate(sys.argv):
		print("arg: %s: %s" % (i, arg))

	if len(sys.argv) != 4:
		Usage()
		return

	input_file = sys.argv[1]
	conditions_file = sys.argv[2]
	output_file = sys.argv[3]

	if not os.path.exists(input_file):
		Usage()
		return

	(records, keys) = adt_infrastructure.BuildDataStructure(input_file)
	trainingDataSet = {}

	num_records = len(records)
	print(" ***** Number of records     : %s" % num_records)

	print(" Computing number of combinations...")
	# Number of unordered pairs: C(n, 2) = n * (n - 1) / 2.
	# The closed form avoids math.factorial, which raises ValueError for
	# fewer than two records and builds enormous integers for large inputs.
	num_combinations = num_records * (num_records - 1) // 2
	print(" ***** Number of combinations: %s" % num_combinations)

	# test unique combinations for record linkage (duplicates)
	total_processed = 0
	for combination_1 in range(num_records):
		for combination_2 in range(combination_1 + 1, num_records):
			tempDictionary = {}
			if levenshtein.Compute_Levenshtein_Distance(records[combination_1]["ID"], records[combination_2]["ID"]) == 0:
				tempDictionary['class'] = 'same'
			else:
				tempDictionary['class'] = 'different'
			# keys[0] is skipped -- presumably the "ID" column; confirm
			# against adt_infrastructure.BuildDataStructure.
			for key in keys[1:]:
				tempDictionary[key] = levenshtein.Compute_Levenshtein_Distance(records[combination_1][key], records[combination_2][key])

			combination_pair = str(combination_1) + '-' + str(combination_2)
			trainingDataSet[combination_pair] = tempDictionary

		# Report progress once the row's pairs have actually been computed.
		total_processed = total_processed + (num_records - combination_1) - 1
		if num_combinations:
			percent_complete = float(total_processed) / float(num_combinations) * 100
		else:
			# A single record yields no pairs; there is nothing left to do.
			percent_complete = 100.0
		print(" ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed))

	allConditions = PopulateConditions(conditions_file)
	print("allConditions: %s" % allConditions)

	# Produce trained tree and results
	adtClassifier = adt.classifier(trainingDataSet, allConditions, "people")
	print("adtClassifier: %s" % adtClassifier)

	adt_infrastructure.WriteTreeToFile(adtClassifier, output_file)
	# The result is not used here; the call is kept because adt.evaluate may
	# have side effects that are not visible from this file.
	adt.evaluate(trainingDataSet, adtClassifier)

	# Create (or truncate) an empty marker file. Presumably it signals an
	# external tree-drawing process to stop -- confirm with its consumer.
	with open("node_end.txt", 'w'):
		pass
def _percentage(numerator, denominator, undefined_message=None):
	"""Return 100 * numerator / denominator, or 0.0 when the denominator is 0.

	When the ratio is undefined and ``undefined_message`` is given, the
	message is printed before 0.0 is returned.
	"""
	if denominator == 0:
		if undefined_message is not None:
			print(undefined_message)
		return 0.0
	return float(numerator) / float(denominator) * 100.0


def main():
	"""Evaluate a saved ADTree on labelled comparison pairs and report its quality.

	Expects exactly three command-line arguments: the training comparisons
	file, the ADTree file and an output file prefix. If the argument count
	is wrong, or either input path does not exist, ``Usage()`` is called and
	nothing else is done.

	The comparison pairs are classified with ``adt.evaluate`` and tallied
	into true/false positives/negatives by comparing each predicted
	"classification" with the one stored in the comparison data. Precision,
	recall and accuracy are printed as percentages and written to
	``<prefix>-Precision.txt``, ``<prefix>-Recall.txt`` and
	``<prefix>-Accuracy.txt``. A metric whose denominator is 0 is reported
	as 0.0.
	"""
	if len(sys.argv) != 4:
		Usage()
		return

	training_comparisons_file = sys.argv[1]
	adtree_file = sys.argv[2]
	output_file_prefix = sys.argv[3]

	if not (os.path.exists(training_comparisons_file) and os.path.exists(adtree_file)):
		Usage()
		return

	adtree = adt_infrastructure.ReCreateADTree(adtree_file, adt)

	number_of_matches, number_of_nonmatches, trainingComparisonPairs = adt_infrastructure.BuildComparisonPairsDataStructure(training_comparisons_file)
	outputDatabase = adt.evaluate(trainingComparisonPairs, adtree)

	number_of_matches = float(number_of_matches)
	number_of_nonmatches = float(number_of_nonmatches)

	true_positives = 0
	true_negatives = 0
	false_positives = 0
	false_negatives = 0
	for combination_pair in outputDatabase.keys():
		actual = trainingComparisonPairs[combination_pair]["classification"]
		predicted = outputDatabase[combination_pair]["classification"]

		if predicted == SAME:
			if actual == SAME:
				true_positives = true_positives + 1
			else:
				false_positives = false_positives + 1
		elif predicted == DIFFERENT:
			if actual == DIFFERENT:
				true_negatives = true_negatives + 1
			else:
				false_negatives = false_negatives + 1
		else:
			print(" **************************** THIS SHOULD NEVER HAPPEN")

	number_of_predicted_matches = true_positives + false_positives
	number_of_predicted_nonmatches = true_negatives + false_negatives

	# Precision = TruePositives / (TruePositives + FalsePositives)
	# (The original computed TP / (TP + FN) here, which is recall, so the
	# -Precision.txt and -Recall.txt files received each other's value.)
	precision = _percentage(
		true_positives,
		true_positives + false_positives,
		" ### ERROR: Precision could not be calculated because (true_positives + false_positives) = 0")

	# Recall = TruePositives / (TruePositives + FalseNegatives)
	recall = _percentage(
		true_positives,
		true_positives + false_negatives,
		" ### ERROR: Recall could not be calculated because (true_positives + false_negatives) = 0")

	#             (TruePositives + TrueNegatives)
	# Accuracy = --------------------------------------------------------------------
	#             (TruePositives + True Negatives + FalsePositives + FalseNegatives)
	accuracy = _percentage(
		true_positives + true_negatives,
		true_positives + true_negatives + false_positives + false_negatives,
		" ### ERROR: Accuracy could not be calculated because (true_positives + true_negatives + false_positives + false_negatives) = 0")

	print("")
	print(" Training Data Record-Pairs    : %s" % len(outputDatabase))
	print("")
	print(" Number of matches             : %s" % str(int(number_of_matches)))
	print(" Number of nonmatches          : %s" % str(int(number_of_nonmatches)))
	print("")
	print(" Number of predicted matches   : %s" % number_of_predicted_matches)
	print(" Number of predicted nonmatches: %s" % number_of_predicted_nonmatches)
	print("")
	# The detection rates are guarded as well: data with no matches (or no
	# nonmatches) would otherwise raise ZeroDivisionError here.
	print(" True Positives                : %s detected of %s (%.2f%%)" % (true_positives, str(int(number_of_matches)), _percentage(true_positives, number_of_matches)))
	print(" True Negatives                : %s detected of %s (%.2f%%)" % (true_negatives, str(int(number_of_nonmatches)), _percentage(true_negatives, number_of_nonmatches)))
	print(" False Positives               : %s" % false_positives)
	print(" False Negatives               : %s" % false_negatives)
	print("")
	print(" Recall                        : %s%%" % recall)
	print(" Precision                     : %s%%" % precision)
	print(" Accuracy                      : %s%%" % accuracy)
	print("")

	with open(output_file_prefix + "-Precision.txt", 'w') as f_precision:
		f_precision.write(str(precision))
		f_precision.write("\n")
	with open(output_file_prefix + "-Recall.txt", 'w') as f_recall:
		f_recall.write(str(recall))
		f_recall.write("\n")
	with open(output_file_prefix + "-Accuracy.txt", 'w') as f_accuracy:
		f_accuracy.write(str(accuracy))
		f_accuracy.write("\n")