def main():
	"""Train an ADT classifier from a comparison-pairs file and save the tree.

	Reads four command-line arguments from sys.argv, in order: the input
	file, the conditions file, the number of tree nodes and the output file.
	Calls Usage() and returns when the argument count is wrong or when the
	input file does not exist.

	The comparison pairs that were loaded are stored in the module-level
	name trainingComparisonPairs.
	"""
	global trainingComparisonPairs

	# Echo every command-line argument together with its position.
	for position, value in enumerate(sys.argv):
		print("arg: %s: %s" % (position, value))

	if len(sys.argv) != 5:
		Usage()
		return

	# numTreeNodes stays the raw command-line string; it is handed to
	# adt.classifier unconverted.
	# NOTE(review): confirm adt.classifier accepts a string node count.
	input_file, conditions_file, numTreeNodes, output_file = sys.argv[1:5]

	if not os.path.exists(input_file):
		Usage()
		return

	loaded = adt_infrastructure.BuildComparisonPairsDataStructure(input_file)
	match_count, non_match_count, trainingComparisonPairs = loaded
	# Debug output of the entry stored under "0-1".
	# NOTE(review): presumably fails if the input yields no such pair.
	print(trainingComparisonPairs["0-1"])

	allConditions = PopulateConditions(conditions_file)
	print("allConditions: %s" % allConditions)

	# Produce the trained tree and write it out.
	adtClassifier = adt.classifier(trainingComparisonPairs, numTreeNodes, allConditions)
	adt_infrastructure.WriteTreeToFile(adtClassifier, output_file)
コード例 #2
0
def main():
	"""Build a pairwise training set from a records file and train an ADT.

	Reads three command-line arguments from sys.argv, in order: the input
	file, the conditions file and the output file. Calls Usage() and returns
	when the argument count is wrong or when the input file does not exist.

	Every unordered pair of records (i, j) with i < j becomes one training
	example stored in the module-level dict trainingDataSet under the key
	"i-j". Each example holds:
	  * 'class': 'same' when the Levenshtein distance between the two "ID"
	    values is 0, otherwise 'different';
	  * one entry per key in keys[1:] with the Levenshtein distance between
	    the two records' values for that key.

	The classifier is then trained, written to the output file and
	evaluated against the training set. Finally an empty file named
	"node_end.txt" is created.

	If fewer than two records are loaded there are no pairs to compare; a
	message is printed and the function returns without training.
	"""
	global trainingDataSet

	# Echo every command-line argument together with its position.
	for i, arg in enumerate(sys.argv):
		print("arg: %s: %s" % (i, arg))

	if len(sys.argv) != 4:
		Usage()
		return

	input_file = sys.argv[1]
	conditions_file = sys.argv[2]
	output_file = sys.argv[3]

	if not os.path.exists(input_file):
		Usage()
		return

	(records, keys) = adt_infrastructure.BuildDataStructure(input_file)
	trainingDataSet = {}

	num_records = len(records)
	print(" ***** Number of records     : %s" % num_records)

	# Without at least two records no pair exists. The factorial formula
	# used previously raised ValueError on a negative argument here.
	if num_records < 2:
		print(" ***** Need at least 2 records to build comparison pairs; found %s" % num_records)
		return

	print(" Computing number of combinations...")
	# C(n, 2) = n! / ((n - 2)! * 2!) simplifies to n * (n - 1) / 2, which
	# avoids building huge factorials for large inputs.
	num_combinations = num_records * (num_records - 1) // 2
	print(" ***** Number of combinations: %s" % num_combinations)

	# Compare every unique pair of records for record linkage (duplicates).
	distance = levenshtein.Compute_Levenshtein_Distance
	total_processed = 0
	for combination_1 in range(num_records):
		# Progress is reported up front: it already counts the pairs that
		# the inner loop below is about to process.
		total_processed += num_records - combination_1 - 1
		percent_complete = float(total_processed) / float(num_combinations) * 100
		print(" ***** %.2f%% complete ----- total processed: %s" % (percent_complete, total_processed))

		first = records[combination_1]
		for combination_2 in range(combination_1 + 1, num_records):
			second = records[combination_2]
			pair_features = {}
			if distance(first["ID"], second["ID"]) == 0:
				pair_features['class'] = 'same'
			else:
				pair_features['class'] = 'different'
			# keys[0] is skipped.
			# NOTE(review): presumably keys[0] is "ID" - confirm.
			for key in keys[1:]:
				pair_features[key] = distance(first[key], second[key])

			combination_pair = str(combination_1) + '-' + str(combination_2)
			trainingDataSet[combination_pair] = pair_features

	allConditions = PopulateConditions(conditions_file)
	print("allConditions: %s" % allConditions)

	# Produce the trained tree and results.
	adtClassifier = adt.classifier(trainingDataSet, allConditions, "people")
	print("adtClassifier: %s" % adtClassifier)

	adt_infrastructure.WriteTreeToFile(adtClassifier, output_file)
	# The return value was never used; the call itself is kept.
	adt.evaluate(trainingDataSet, adtClassifier)

	# Create (or truncate) an empty marker file.
	# NOTE(review): presumably signals a drawing process to stop - confirm.
	with open("node_end.txt", 'w'):
		pass