import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import data_process


def Manhattan_classify(train, test2):
    '''Classify the test set using the Manhattan distance.'''
    total = data_process.count_total(test2)
    K = 1
    a = np.array(test2)
    t_data = a[:, 0:4]
    predict = []
    # Classify each sample in the test set
    for sample in t_data:
        sort = Manhattan_distance(train, sample)
        lab_0 = 0
        lab_1 = 0
        # Majority vote among the K nearest neighbours
        for i in range(K):
            if sort[i][1] == 0:
                lab_0 += 1
            else:
                lab_1 += 1
        if lab_0 > lab_1:
            predict.append(0)
        else:
            predict.append(1)
    # Write the predictions back into a copy of the test set
    Manhattan_result = list(test2)
    for i in range(total):
        Manhattan_result[i][4] = predict[i]
    title = [
        'Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
        'Time (months)', 'My prediction'
    ]
    Manhattan_result.insert(0, title)
    submit2 = pd.DataFrame(data=Manhattan_result)
    submit2.to_csv('./task1_test_Manhattan.csv',
                   encoding='gbk',
                   header=None,
                   index=None)
def Chebyshev_distance(train_data, data):
    '''Compute the Chebyshev distance between one sample and every training
    sample, and return the results sorted in ascending order.'''
    a1 = np.array(train_data)
    a2 = np.array(data)
    td = a1[:, 0:4]
    d = a2[0:4]
    label = a1[:, 4]
    C_distance = []
    dis_lab = []  # Chebyshev distance to each training sample, paired with that sample's label
    for sample in td:
        norm = max(abs(sample - d))  # largest absolute coordinate difference
        C_distance.append(norm)
    for i in range(data_process.count_total(train_data)):
        dis_lab.append([C_distance[i], label[i]])
    sort = sorted(dis_lab, key=lambda x: x[0])  # ascending by Chebyshev distance
    return sort
def Euclidean_distance(train_data, data):
    '''Compute the Euclidean distance between one sample and every training
    sample, and return the results sorted in ascending order.'''
    a1 = np.array(train_data)
    a2 = np.array(data)
    td = a1[:, 0:4]
    d = a2[0:4]
    label = a1[:, 4]
    E_distance = []
    dis_lab = []  # Euclidean distance to each training sample, paired with that sample's label
    for sample in td:
        norm = np.linalg.norm(sample - d)
        E_distance.append(norm)
    for i in range(data_process.count_total(train_data)):
        dis_lab.append([E_distance[i], label[i]])
    sort = sorted(dis_lab, key=lambda x: x[0])  # ascending by Euclidean distance
    return sort
def Manhattan_distance(train_data, data):
    '''Compute the Manhattan distance between one sample and every training
    sample, and return the results sorted in ascending order.'''
    a1 = np.array(train_data)
    a2 = np.array(data)
    td = a1[:, 0:4]
    d = a2[0:4]
    label = a1[:, 4]
    C_distance = []
    dis_lab = []  # Manhattan distance to each training sample, paired with that sample's label
    for sample in td:
        norm = np.sum(np.abs(sample - d))  # sum of absolute coordinate differences
        C_distance.append(norm)
    for i in range(data_process.count_total(train_data)):
        dis_lab.append([C_distance[i], label[i]])
    sort = sorted(dis_lab, key=lambda x: x[0])  # ascending by Manhattan distance
    return sort
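# Optional sanity check, not part of the original pipeline: the three hand-rolled
# metrics above should agree with their scipy.spatial.distance counterparts on any
# pair of 4-dimensional feature vectors. The vectors here are arbitrary examples,
# and scipy is assumed to be available only for this check.
def _check_distances():
    from scipy.spatial import distance as sp_dist
    u = np.array([2.0, 50.0, 12500.0, 98.0])
    v = np.array([0.0, 13.0, 3250.0, 28.0])
    assert np.isclose(max(abs(u - v)), sp_dist.chebyshev(u, v))        # Chebyshev
    assert np.isclose(np.linalg.norm(u - v), sp_dist.euclidean(u, v))  # Euclidean
    assert np.isclose(np.sum(np.abs(u - v)), sp_dist.cityblock(u, v))  # Manhattan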
def Mahalanobis_distance(train_data, data):
    '''Compute the Mahalanobis distance between one sample and every training
    sample, and return the results sorted in ascending order.'''
    A1 = np.array(A)  # A is the transform matrix defined at module level
    a1 = np.array(train_data)
    a2 = np.array(data)
    td = a1[:, 0:4]
    d = a2[0:4]
    label = a1[:, 4]
    C_distance = []
    dis_lab = []  # Mahalanobis distance to each training sample, paired with that sample's label
    for sample in td:
        temp = np.dot(d - sample, A1)
        temp_tr = np.transpose(temp)
        mul = np.dot(temp, temp_tr)
        norm = math.sqrt(mul)
        C_distance.append(norm)
    for i in range(data_process.count_total(train_data)):
        dis_lab.append([C_distance[i], label[i]])
    sort = sorted(dis_lab, key=lambda x: x[0])  # ascending by Mahalanobis distance
    return sort
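# Mahalanobis_distance depends on a module-level transform matrix A whose
# construction lives elsewhere in the original project. The helper below is a
# minimal sketch of one plausible construction (an assumption, not the author's
# code): factor the inverse covariance of the training features as L @ L.T, so
# that the norm of (x @ L) equals the Mahalanobis norm of x.
def build_transform_matrix(train_data):
    features = np.array(train_data)[:, 0:4].astype(float)
    inv_cov = np.linalg.inv(np.cov(features, rowvar=False))  # S^{-1}
    return np.linalg.cholesky(inv_cov)  # candidate A with A @ A.T == S^{-1}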
def decide_label(train_data, val_data):
    '''Use the validation set to compute the accuracy for each candidate K.'''
    total = data_process.count_total(val_data)
    a = np.array(val_data)
    data = a[:, 0:4]
    label = a[:, 4]
    corr = []  # accuracy for each K
    for K in range(1, 30, 2):
        predict = []
        # Classify each sample in the validation set
        for sample in data:
            sort = Mahalanobis_distance(train_data, sample)
            lab_0 = 0
            lab_1 = 0
            for i in range(K):
                if sort[i][1] == 0:
                    lab_0 += 1
                else:
                    lab_1 += 1
            if lab_0 > lab_1:
                predict.append(0)
            else:
                predict.append(1)
        correct = 0
        for i, lab in enumerate(label):
            if predict[i] == lab:
                correct += 1
        corr.append(correct / total)
    # Plot accuracy against K
    K = list(range(1, 30, 2))
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    fig.suptitle('K of Mahalanobis', fontsize=14, fontweight='bold')
    ax.set_xlabel("K")
    ax.set_ylabel("accuracy")
    plt.plot(K, corr)
    plt.show()
    K2 = corr.index(max(corr)) * 2 + 1  # the K with the highest accuracy
    return K2
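# The vote-and-predict loop above is repeated almost verbatim in Manhattan_classify
# and decide_label. The helper below is a small sketch (not part of the original
# code) showing how the same pattern could take any of the four distance functions
# and any K, with the same tie-breaking as the original (ties go to label 1).
def knn_predict(train_data, samples, distance_fn, K):
    predictions = []
    for sample in samples:
        neighbours = distance_fn(train_data, sample)[:K]  # K closest [distance, label] pairs
        votes_0 = sum(1 for _, lab in neighbours if lab == 0)
        votes_1 = K - votes_0
        predictions.append(0 if votes_0 > votes_1 else 1)
    return predictions
# Example: knn_predict(train_data, np.array(val_data)[:, 0:4], Euclidean_distance, K=3)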