"""Simple linear regression on the Boston housing data (room count vs. price).

Fits the course's SimpleLinearRegression2 on a single feature and evaluates
the predictions with scikit-learn's MSE / MAE.
"""
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import mean_absolute_error, mean_squared_error

# NOTE(review): module paths assumed from the sibling lesson scripts
# (functions.model_selection etc.) — confirm against the project layout.
from functions.model_selection import train_test_split
from functions.SimpleLinearRegression import SimpleLinearRegression2

# Load the dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — migrate to
# another data source if running on a recent sklearn.
boston = datasets.load_boston()
X = boston.data[:, 5]  # column 5 only: average number of rooms per dwelling
y = boston.target
# plt.scatter(X, y)
# plt.show()

# The target is capped at 50.0; drop those censored points.
X = X[y < 50.0]
y = y[y < 50.0]
# plt.scatter(X, y)
# plt.show()

# Train/test split (custom helper: test_ratio / seed keywords).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)

reg2 = SimpleLinearRegression2()
reg2.fit(X_train, y_train)
print(reg2.a_)
print(reg2.b_)

# Inspect the fit on the training data.
plt.scatter(X_train, y_train)
# plt.plot(X_train, reg2.predict(X_train), color='red')
# plt.show()

# Predict on the held-out set.
y_predict = reg2.predict(X_test)

# scikit-learn MSE and MAE.
# BUG FIX: the original evaluated these expressions and discarded the
# values; print them so the script actually reports the metrics.
print(mean_squared_error(y_test, y_predict))
print(mean_absolute_error(y_test, y_predict))
# Preprocessing + modelling pipeline for the df_treino opportunity dataset.
# NOTE(review): df_treino, tratamento, encoding, model_selection and the
# sklearn estimators must already be in scope — their imports are outside
# this chunk.

# Cleaning / feature-engineering steps; each helper returns a DataFrame.
df_treino = tratamento.fix_unidade_e_area(df_treino)
df_treino = tratamento.add_features(df_treino)
df_treino = tratamento.drop_unecessary(df_treino)
df_treino = tratamento.get_cidade_e_is_privado(df_treino)
df_treino = tratamento.get_objetivo_client(df_treino)
df_treino = encoding.encoding_foco(df_treino)

# Keep a copy of the raw column before one-hot encoding replaces it.
df_treino.loc[:, 'Area_Unidade_Negocio2'] = df_treino['Area_Unidade_Negocio']
df_treino = encoding.get_dummies(df_treino)

# Only rows with Resultado == 1 are modelled.
df_treino1 = df_treino[df_treino.Resultado == 1]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_treino1, res='ResultadoPrevisao')

# Candidate features: every column except identifiers and target columns.
all_feats = [
    f for f in X_train.columns
    if f not in [
        'Id_ON', 'Id_Cadencia', 'Area_Unidade_Negocio2',
        'ResultadoPrevisao', 'TrimestreResultado'
    ]
]

clf = RandomForestClassifier(n_estimators=500, max_features=1, random_state=30)
# Feature selection disabled for now; use all candidate features.
selected_features = all_feats
# model_selection.feature_selection(clf, X_train[all_feats], y_train, k=40)

model = GradientBoostingClassifier(max_features=0.7, random_state=10)
# BUG FIX: the original called model.fit(X_train[...]) and then immediately
# refitted on all of df_treino1, discarding the first fit entirely (sklearn
# fit re-initializes, and random_state=10 makes the result deterministic).
# Only the effective final fit is kept.
# WARNING(review): fitting on all of df_treino1 and then predicting on
# X_test leaks the test rows into training — confirm this is intentional.
model.fit(df_treino1[selected_features], df_treino1.ResultadoPrevisao)

df = X_test.copy()
predictions = model.predict(df[selected_features])
"""
4-3: call the packaged kNN implementation.

Trains the course's KNNClassifier on the iris dataset and reports its
accuracy on the held-out test set.
"""
from functions.model_selection import train_test_split
from functions.kNN import KNNClassifier
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
X = iris.data    # feature matrix
y = iris.target  # label vector

X_train, X_test, y_train, y_test = train_test_split(X, y)

my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
# 30 test predictions; some will differ from y_test.
y_predict = my_knn_clf.predict(X_test)

# Accuracy: (y_predict == y_test) is a boolean array where True counts as 1,
# so its sum is the number of correct predictions.
# BUG FIX: the original evaluated this expression and discarded the value;
# bind and print it so the script actually reports the accuracy.
accuracy = sum(y_predict == y_test) / len(y_test)
print(accuracy)
# Plot the decision boundary of a self-implemented logistic regression on
# the first two iris classes, using only two features so it fits in 2-D.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from functions.model_selection import train_test_split
from functions.LogisticRegression import LogisticRegression

iris = datasets.load_iris()
features = iris.data
labels = iris.target

# Reduce the 3-class iris problem to a binary one: keep classes 0 and 1,
# and only the first two features (convenient for plotting).
X = features[labels < 2, :2]
y = labels[labels < 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)


def x2(x1):
    # Solve theta0 + theta1*x1 + theta2*x2 = 0 for x2, i.e. the line where
    # the model's predicted probability is exactly 0.5.
    w1, w2 = log_reg.coef_[0], log_reg.coef_[1]
    return -(w1 * x1 + log_reg.intercept_) / w2


# Draw the boundary over the feature-1 range, then overlay both classes.
boundary_x1 = np.linspace(4, 8, 1000)
plt.plot(boundary_x1, x2(boundary_x1))
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")
plt.show()