def main():
    """Fit a two-class linear discriminant on the sheet data and plot it.

    Loads the table through ``readxls.excel_table_byname``, takes entries 0
    and 1 of the table as the two feature rows and entry 3 as the class
    label of each sample (used as a 0/1 column index), then:

    1. computes the mean vector of each class,
    2. accumulates the within-class scatter matrix ``sw``,
    3. derives the projection vector ``w = sw^-1 * (u0 - u1)``,
    4. draws both classes and a line derived from ``w``.

    The sample count and the per-class counts are taken from the labels
    themselves rather than being hard-coded, so the class means stay correct
    whatever the class sizes in the sheet are.

    Raises:
        ValueError: if one of the two classes has no samples.
    """
    data = readxls.excel_table_byname(
        "/Users/JJjie/Desktop/www/Mechine_Learning/dataset/西瓜3.0.xlsx",
        0, "Sheet1")
    labels = [int(value) for value in data[3]]
    x = mat(data[0:2])

    # Per-class mean vectors: column c of `u` holds the mean of class c.
    u = mat(zeros((2, 2)))
    counts = [0, 0]
    for i, cls in enumerate(labels):
        u[:, cls] = u[:, cls] + x[:, i]
        counts[cls] += 1
    if 0 in counts:
        raise ValueError("both classes need at least one sample")
    u[:, 0] /= counts[0]
    u[:, 1] /= counts[1]

    # Within-class scatter: sum of outer products of the centred samples.
    sw = zeros((2, 2))
    for i, cls in enumerate(labels):
        centred = x[:, i] - u[:, cls]
        sw = sw + centred * centred.T

    # Projection direction; linalg.inv raises if `sw` is singular.
    w = linalg.inv(sw) * (u[:, 0] - u[:, 1])

    # Plot the two classes; label 1.0 is drawn as "Good", anything else "Bad".
    plt.title("LDA")
    plt.xlabel("Denisty")
    plt.ylabel("Sguar content")
    x1, y1, x2, y2 = [], [], [], []
    for label, first, second in zip(data[3], data[0], data[1]):
        if label == 1.0:
            x1.append(first)
            y1.append(second)
        else:
            x2.append(first)
            y2.append(second)
    plt.plot(x1, y1, 'ro', label="Good")
    plt.plot(x2, y2, 'og', label="Bad")

    # Endpoints (at x = 0.2 and x = 0.8) of the line W[0]*x + W[1]*y = 0.01.
    W = w.T.A[0]
    pl = -(0.2 * W[0] - 0.01) / W[1]
    pr = -(0.8 * W[0] - 0.01) / W[1]
    plt.plot([0.2, 0.8], [pl, pr])
    plt.legend()
    plt.show()
def main():
    """Load the sheet, fit the regression model and draw the result.

    Entries 0-2 of the loaded table are transposed into the input matrix and
    entry 3 into the target column; both are handed to
    ``logarithmic_regression`` and the returned coefficients are passed on
    to ``draw`` together with the raw table and labels.
    """
    # Data preparation
    table = readxls.excel_table_byname(
        "/Users/JJjie/Desktop/www/Mechine_Learning/dataset/西瓜3.0.xlsx",
        0,
        "Sheet1",
    )
    raw_labels = table[3]
    features = mat(table[0:3]).T
    targets = mat(raw_labels).T

    # Fit, then plot.
    coefficients = logarithmic_regression(features, targets, 2)
    draw(
        table,
        coefficients,
        table[3],
        "Logarithmic regression",
        "Denisty",
        "Sguar content",
    )
# NOTE(review): the loop below looks like the tail of a function whose `def`
# line is outside this excerpt (presumably `run`, which is called at the
# bottom). `b`, `k`, `y`, `old_l` and `n` are expected to exist before this
# point, and the nesting shown here is reconstructed -- confirm both against
# the full file.
while (1):
    cur_l = 0
    bx = np.zeros((17, 1))
    # The zero array above is replaced straight away by the product b.T . k.
    bx = np.dot(b.T, k)
    # Objective value: sum over the first 17 entries of
    # -y * bx + log(1 + exp(bx)).
    cur_l = sum((-y * bx[0][:17]) + np.log(1 + np.exp(bx[0][:17])))
    # Stop when the objective grew by less than 0.001 since the last pass.
    # NOTE(review): the difference is signed, so any decrease also stops the
    # loop -- confirm whether an absolute difference was intended.
    if cur_l - old_l < 0.001:
        break
    n += 1
    old_l = cur_l
    p1 = np.zeros((17, 1))
    dl = 0
    d2l = 0
    for i in range(17):
        # p1[i] = 1 - 1 / (1 + exp(bx_i)), i.e. exp(bx_i) / (1 + exp(bx_i)).
        p1[i] = 1 - 1 / (1 + np.exp(bx[0][i]))
        # Accumulate the first-order term ...
        dl -= k[:, i] * (y[i] - p1[i])
        # ... and the second-order term.
        d2l += np.dot(k[:, i], k[:, i].T) * p1[i] * (1 - p1[i])
    # NOTE(review): the update divides the second-order term by the
    # first-order one; a Newton step is normally b - dl / d2l -- confirm.
    b = b - d2l / dl


# Script entry point: load "Sheet1" of the workbook, use entries 0-1 of the
# table as inputs and entry 3 as targets, and hand both to `run`.
if __name__ == '__main__':
    data = readxls.excel_table_byname(
        "/Users/JJjie/Desktop/www/Mechine_Learning/dataset/西瓜3.0.xlsx", 0, "Sheet1")
    x = np.array(data[0:2])
    y = np.array(data[3])
    run(x, y)
# @Author : UNE # @Project : Mechine_learning # @File : K_means_pro.py # @Software: PyCharm # 《机器学习》(周志华)第九章9.10 """ 实现一种能自动确定聚类数的改进k均值算法,编程实现并在西瓜数据集上运行。 """ from tool import readxls import numpy as np import matplotlib.pyplot as plt if __name__ == '__main__': data = readxls.excel_table_byname( "/Users/JJjie/Desktop/Projects/Mechine_Learning/dataset/西瓜4.xlsx", 0, "Sheet1") data = np.array(data) (m, n) = data.shape old_ts = 100 # 当前最低的平方误差,初始设置为一个很大的值 old_c = 0 old_nums = 0 for k in range(2, 10): u = data[np.random.randint(30, size=k), :] # 产生随机均值 while 1: c = np.zeros((k, 30), dtype="int64") # 将各类集合清空 nums = np.zeros((k, 1), dtype="int64") # 对所有样本遍历,选择最近集合
# @Author : UNE # @Site : # @File : AdaBoosw.py # @Software: PyCharm # 《机器学习》(周志华)第八章8.3 """ 编程实现AdaBoost,以不剪枝决策树为基学习器,在西瓜数据集3.0α上训练一个AdaBoost集成,并与图8.4作比较 """ from tool import readxls import numpy as np import pandas as pd from dTree import dTree if __name__ == '__main__': data = readxls.excel_table_byname( "/Users/JJjie/Desktop/Projects/dataset/西瓜3.xlsx", 0, "Sheet1") x = pd.DataFrame(data[6:8]) y = pd.DataFrame(data[8]) y = y.T y_index = y - 1 y = -2 * y + 3 # 将y映射到1,-1 try: # 一维数组的情况 m, n = y.shape except: m = 1 n = len(y) set = np.arange(0, n) sy = np.zeros((1, 17)) # 记录累积分类器的分类 sw = np.ones((1, 17)) / 17 # 样本的权值,初始相同