# -*- encoding: utf-8 -*-
"""8.8.1 Estimator scoring."""

# --- Classification estimator ---
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split as tsplit
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
# Hold out 10% of the samples for testing (no fixed seed: split varies per run).
X_train, X_test, y_train, y_test = tsplit(X, y, test_size=0.1)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))  # accuracy via the metric function
print(knn.score(X_test, y_test))       # same accuracy computed by the estimator itself

# --- Regression estimator ---
# (duplicate `train_test_split as tsplit` re-import removed; alias above is reused)
from sklearn.svm import SVR
from sklearn import metrics  # presumably used by code past this chunk — kept

# NOTE(review): load_iris targets are discrete class labels; fitting SVR
# treats them as continuous values — confirm this dataset choice is intended.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = tsplit(X, y, test_size=0.1)
svr = SVR()
svr.fit(X_train, y_train)
def build_model(X, W, dropout=0):
    """Build a fully-connected ReLU network from the weight matrices in W.

    X: input tensor/placeholder; W: list of weight matrices applied in order
    (all but the last form hidden layers). dropout: fraction of hidden units
    to drop (0 disables dropout). Returns the output tensor; note the output
    layer is also ReLU-activated, so outputs are non-negative.
    """
    h = X
    for wi in W[:-1]:
        # TF1-style tf.nn.dropout: second positional arg is keep_prob,
        # hence 1.0 - dropout.
        h = tf.nn.dropout(tf.nn.relu(tf.matmul(h, wi)),1.0-dropout)
    return tf.nn.relu(tf.matmul(h,W[-1]))

# initialization
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TF C++ logging below ERROR
shutil.rmtree('log',ignore_errors=True)   # start from a clean log/summary dir
tf.reset_default_graph()                  # TF1 graph-mode: reset default graph

# load the data
# get_data is defined elsewhere; presumably returns (features, labels, ids) —
# the test file is loaded without labels via loady=False. TODO confirm.
trX,trY,trId = get_data('data/train.csv')
teX,teY,teId = get_data('data/test.csv',loady=False)
# hold out 30% of the training rows (with their ids) as a validation set
trX,vaX,trY,vaY,trId,vaId = tsplit(trX,trY,trId,test_size=0.3)

# define the parameters
size_in = trX.shape[1]              # number of input features
size_out = trY.shape[1]             # number of output targets
size_h = [size_in,512,64,size_out]  # layer widths: in -> 512 -> 64 -> out
batch_size = 128

# define tf symbolic variables
X = tf.placeholder("float", [None, size_in])
Y = tf.placeholder("float", [None, size_out])
# one weight matrix per consecutive pair of layer widths
W = [init_weights([hi,ho]) for hi,ho in zip(size_h[:-1],size_h[1:])]

# define the model & operations
# Two graphs over the SAME weight list W (weights are shared):
Yt = build_model(X, W, dropout=0.3)  # training graph, dropout enabled
Yp = build_model(X, W, dropout=0.0)  # prediction graph, dropout disabled
movies["budgetReturn"] = movies["gross"] / movies["budget"] #Creating a column 'isLong' which defines whether or not a movie is a long #(greater than the standard 120 minutes) movies["isLong"] = np.where((movies["runtime"] > 120), 1,0) #Creating an analagous column 'isShort' which defines whether or not a movie is # short (less than the standard 75 minutes minimum) movies["isShort"] = np.where((movies["runtime"] < 75),1,0) #Splitting Data into training and testing movies_train, movies_test = tsplit(movies, test_size = 0.2, random_state = 1) #Moving columns to more appropriate locations movies = movies[['name','genre', 'rating' , 'country', 'company', 'isMajor', 'released', 'year', 'runtime', 'isLong', 'isShort', 'score', 'votes', 'star', 'director', 'budget', 'gross', 'budgetReturn']] #Removing duplicates, none found with below function movies.duplicated() #Splitting data into features (indepedent variables) and response #(dependet variable) features1 = movies.loc[:,"name":"budget"] features2 = movies.loc[:,"budgetReturn"] features = pd.concat([features1, features2],axis = 1)
# -*- encoding: utf-8 -*-
"""8.4.2 Naive Bayes classification."""

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectors
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes
from sklearn.model_selection import train_test_split as tsplit
from sklearn.metrics import classification_report  # per-class evaluation report

# Fetch the 20-newsgroups corpus together with its category labels.
X, y = fetch_20newsgroups(return_X_y=True)

# Turn the raw documents into a sparse TF-IDF matrix.
vectorizer = TfidfVectorizer()
vdata = vectorizer.fit_transform(X)

# Hold out 10% of the vectorized documents for testing.
x_train, x_test, y_train, y_test = tsplit(vdata, y, test_size=0.1)

# Fit a multinomial naive Bayes classifier (fit returns the estimator itself).
m = MultinomialNB().fit(x_train, y_train)

# Mean accuracy on the held-out documents.
precision = m.score(x_test, y_test)
print('测试集分类准确率:%0.2f' % precision)

# Detailed per-class precision/recall/F1 breakdown.
y_pred = m.predict(x_test)
report = classification_report(y_test, y_pred)
print('测试集分类结果报告:\n', report)