def classify(k, sample, training_data, att_names = None):
    """Classify ``sample`` by majority vote among its k nearest training rows.

    Distances come from ``euclid_distance_squared`` restricted to the columns
    named in ``att_names``. The class label of each row is read from the
    column called "class" in the data header.

    NOTE(review): another function named ``classify`` appears later in this
    source; if both live in the same module the later definition replaces
    this one -- confirm they belong to separate modules.

    Args:
        k: Number of neighbours to consult. If it exceeds the number of
            training rows, every row votes (previously this raised
            IndexError from popping an empty heap).
        sample: Row to classify; it must carry its true label in the class
            column so that correctness can be reported.
        training_data: Iterable of labelled rows.
        att_names: Attribute names used for the distance; defaults to the
            complete header.

    Returns:
        A ``(predicted, correct)`` tuple. ``predicted`` is 0 when "class0"
        neighbours strictly outnumber "class1" neighbours, otherwise 1
        (ties go to 1). ``correct`` is True when the prediction matches the
        label stored in ``sample``.
    """
    # Local import: the surrounding file is only known to expose
    # heappush/heappop, so bring nsmallest into scope here.
    from heapq import nsmallest

    header = data_preprocessing.get_header()
    if att_names is None:
        att_names = header
    attribute_indexes = convert_att_names_to_indexes(att_names)
    class_index = header.index("class")

    # nsmallest with a key never compares the rows themselves (equal
    # distances keep input order) and returns at most len(training_data)
    # rows, so an oversized k is harmless.
    neighbours = nsmallest(
        k,
        training_data,
        key=lambda row: euclid_distance_squared(sample, row, attribute_indexes)
    )

    class0_count = 0
    class1_count = 0
    for neighbour in neighbours:
        label = neighbour[class_index]
        if label == "class0":
            class0_count += 1
        elif label == "class1":
            class1_count += 1

    if class0_count > class1_count:
        return 0, bool(sample[class_index] == 'class0')
    return 1, bool(sample[class_index] == 'class1')
def convert_att_names_to_indexes(attributes):
    """Translate attribute names into their column positions in the header.

    The name "class" is always left out. Names that do not occur in the
    header are skipped: the original ``index > -1`` guard shows that this was
    the intent, but ``list.index`` raises ValueError instead of returning -1,
    so the guard could never take effect.

    Args:
        attributes: Iterable of attribute names.

    Returns:
        List of header positions, in the order the names were given.
    """
    # Fetch the header once instead of once per attribute.
    header = data_preprocessing.get_header()
    attribute_indexes = []
    for attribute_name in attributes:
        if attribute_name == "class":
            continue
        try:
            index = header.index(attribute_name)
        except ValueError:
            # Unknown column name: ignore it rather than abort the lookup.
            continue
        attribute_indexes.append(index)
    return attribute_indexes
def calculate_PDF(inputArray,class_zero,class_one,attr_names=False):
    """Evaluate ``PDF_math`` for every attribute of a sample, once per class.

    Args:
        inputArray: Mapping from attribute name to the sample's value.
        class_zero: Per-attribute statistics for class zero; each entry
            provides 'mean' and 'sd'.
        class_one: Same structure for class one.
        attr_names: Header names to use, or False for the full data header.
            The last name is skipped (the code treats it as the class column).

    Returns:
        ``{'class_zero': {...}, 'class_one': {...}}`` where each inner dict
        maps an attribute name to the value returned by ``PDF_math``.
    """
    headers = attr_names
    if attr_names == False:  # False is the "use the full header" sentinel
        headers = data_preprocessing.get_header()

    zero_pdfs = {}
    one_pdfs = {}
    for name in headers[:-1]:
        value = inputArray[name]
        zero_stats = class_zero[name]
        zero_pdfs[name] = PDF_math(value, zero_stats['mean'], zero_stats['sd'])
        one_stats = class_one[name]
        one_pdfs[name] = PDF_math(value, one_stats['mean'], one_stats['sd'])

    return {'class_zero': zero_pdfs, 'class_one': one_pdfs}
def convert_array_to_dict(inputArray,custom_headers=False):
    """Turn a positional row into a dict keyed by header name.

    Every header column is included, the class column too. When
    ``custom_headers`` is supplied, only those of its names that exist in the
    header are kept, in the order given.

    Args:
        inputArray: Row indexable by position; it needs at least as many
            entries as the header has names.
        custom_headers: Iterable of names to keep, or False / None for all
            columns.

    Returns:
        Dict mapping header name to the row's value in that column. If a
        header name repeats, the value from its last column is kept.
    """
    headers = data_preprocessing.get_header()

    # enumerate gives each column its own position directly, instead of a
    # linear headers.index() search per column.
    row = {}
    for position, header in enumerate(headers):
        row[header] = inputArray[position]

    if custom_headers is None or custom_headers == False:
        return row

    # Subset of headers requested; names missing from the header are dropped.
    return {key: row[key] for key in custom_headers if key in row}
def calculate_mean_sd(inputData,attr_names=False):
    """Compute per-class mean and population standard deviation per attribute.

    Rows labelled "class1" feed the class-one statistics (people with
    diabetes); every other row feeds class zero (people without diabetes).

    Args:
        inputData: Iterable of positional rows accepted by
            ``convert_array_to_dict``. It is consumed only once.
        attr_names: Header names to use, or False for the full data header.
            The last name is skipped when building the statistics (the code
            treats it as the class column), and each converted row must
            contain a "class" key.

    Returns:
        ``(class_zero, class_one)``. Each dict maps an attribute name to
        ``{'mean': ..., 'sd': ...}`` and also holds 'size', the number of rows
        seen for that class. A class without rows keeps mean 0 and sd 0
        instead of raising ZeroDivisionError.
    """
    if attr_names == False:  # False is the "use the full header" sentinel
        headers = data_preprocessing.get_header()
    else:
        headers = attr_names
    feature_names = headers[:-1]

    class_one = {}   # People with Diabetes
    class_zero = {}  # People without Diabetes
    for header in feature_names:
        class_one[header] = {'mean': 0, 'sd': 0}
        class_zero[header] = {'mean': 0, 'sd': 0}
    class_one['size'] = 0
    class_zero['size'] = 0

    # Convert every row exactly once and remember which class it feeds, so
    # both passes below reuse the same dicts.
    labelled_rows = []
    for row in inputData:
        features = convert_array_to_dict(row, attr_names)
        label = features.pop("class")
        stats = class_one if label == "class1" else class_zero
        labelled_rows.append((stats, features))

    # Pass 1: accumulate sums, then divide by the class size to get means.
    for stats, features in labelled_rows:
        for key, value in features.items():
            stats[key]['mean'] += value
        stats['size'] += 1
    for stats in (class_zero, class_one):
        if stats['size']:
            for header in feature_names:
                stats[header]['mean'] = stats[header]['mean'] / stats['size']

    # Pass 2: accumulate (xi - mean)^2, then sd = sqrt(total / N).
    for stats, features in labelled_rows:
        for key, value in features.items():
            stats[key]['sd'] += math.pow(value - stats[key]['mean'], 2)
    for stats in (class_zero, class_one):
        if stats['size']:
            for header in feature_names:
                stats[header]['sd'] = math.sqrt(stats[header]['sd'] / stats['size'])

    return class_zero, class_one
def classify(inputArray,class_zero,class_one,attr_names=False):
    """Pick the class with the larger prior-times-PDF product for one sample.

    Each class score starts as that class's share of the combined 'size'
    counts and is multiplied by the per-attribute values returned by
    ``calculate_PDF``.

    Args:
        inputArray: Positional row, converted with ``convert_array_to_dict``;
            the converted row must contain the true label under 'class'.
        class_zero: Statistics for class zero, including 'size'.
        class_one: Statistics for class one, including 'size'.
        attr_names: Header names to use, or False for the full data header.
            The last name is skipped when multiplying (the code treats it as
            the class column).

    Returns:
        ``(predicted, correct)``: ``predicted`` is 1 (diabetic) when the
        class-one score minus the class-zero score is >= 0, else 0
        (non-diabetic); ``correct`` tells whether the sample's own label is
        'class1' or 'class0' respectively.
    """
    headers = attr_names
    if attr_names == False:  # False is the "use the full header" sentinel
        headers = data_preprocessing.get_header()

    sample = convert_array_to_dict(inputArray, attr_names)
    pdf_array = calculate_PDF(sample, class_zero, class_one, attr_names)

    # Priors: each class's fraction of all training rows.
    population = float(class_one['size'] + class_zero['size'])
    score_one = float(class_one['size']) / population
    score_zero = float(class_zero['size']) / population

    # Multiply in the per-attribute values for both classes.
    one_pdfs = pdf_array['class_one']
    zero_pdfs = pdf_array['class_zero']
    for name in headers[:-1]:
        score_one *= one_pdfs[name]
        score_zero *= zero_pdfs[name]

    if (score_one - score_zero) >= 0:
        # Diabetic; correct only if the actual label agrees.
        return 1, bool(sample['class'] == 'class1')
    # Non-diabetic.
    return 0, bool(sample['class'] == 'class0')