Esempio n. 1
0
# -*- encoding: utf-8 -*-
"""
8.8.1 Estimator scoring
"""

# Classification estimator: fit a k-nearest-neighbours classifier on the iris
# data and report its test-set accuracy in two ways.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split as tsplit
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
# Hold out 10% of the samples for testing. No random_state is passed, so the
# split is presumably different on every run.
X_train, X_test, y_train, y_test = tsplit(X, y, test_size=0.1)
knn = KNeighborsClassifier()  # default hyper-parameters
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
# Both prints evaluate the same fitted model on the same test set and are
# expected to show the same number.
print(accuracy_score(y_test, y_pred))  # via the accuracy metric function
print(knn.score(X_test, y_test))  # via the estimator's own score() on the test set

# Regression estimator: fit a support-vector regressor on the iris data.
# Relies on load_iris imported in the classification section above.

from sklearn.svm import SVR
# tsplit is already imported in the classification section; this re-import is
# redundant but harmless.
from sklearn.model_selection import train_test_split as tsplit
# NOTE(review): `metrics` is not used in the visible part of this snippet;
# presumably the scoring code follows beyond what is shown here.
from sklearn import metrics

# Reload and re-split the data; this rebinds X, y and the train/test names
# created by the classification section.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = tsplit(X, y, test_size=0.1)
svr = SVR()  # default hyper-parameters
# The iris targets are used here as the regression target values.
svr.fit(X_train, y_train)
Esempio n. 2
0
def build_model(X, W, dropout=0):
  """Chain the weight tensors in W into a ReLU feed-forward graph over X.

  Every tensor in ``W`` except the last is applied as
  matmul -> ReLU -> ``tf.nn.dropout``, with ``1.0 - dropout`` passed as the
  second positional argument of ``tf.nn.dropout`` (the keep probability in
  the TF1 API, which this file appears to target -- confirm).  The last
  tensor is applied as matmul -> ReLU with no dropout.

  Args:
    X: input tensor fed to the first ``tf.matmul``.
    W: sequence of weight tensors; read as ``W[:-1]`` and ``W[-1]``.
    dropout: amount subtracted from 1.0 for each dropout call (default 0).

  Returns:
    The ReLU of the final matmul.
  """
  activation = X
  hidden_weights = W[:-1]
  for weight in hidden_weights:
    pre_activation = tf.matmul(activation, weight)
    rectified = tf.nn.relu(pre_activation)
    activation = tf.nn.dropout(rectified, 1.0 - dropout)
  # The output layer is rectified too, but never passed through dropout.
  output_pre_activation = tf.matmul(activation, W[-1])
  return tf.nn.relu(output_pre_activation)

# initialization
# Set the TensorFlow C++ log-level environment variable (presumably to
# silence native log output -- confirm), delete any existing 'log' directory
# (errors such as a missing directory are ignored) and reset the default graph.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
shutil.rmtree('log',ignore_errors=True)
tf.reset_default_graph()

# load the data
# get_data is defined outside this view; it is unpacked into three values per
# call (named here as inputs, targets and ids).
trX,trY,trId = get_data('data/train.csv')
teX,teY,teId = get_data('data/test.csv',loady=False)
# Split the training arrays 70/30; the 30% part is bound to the va* names
# (presumably the validation set). tsplit is presumably sklearn's
# train_test_split, imported outside this view.
trX,vaX,trY,vaY,trId,vaId = tsplit(trX,trY,trId,test_size=0.3)

# define the parameters
# Both trX and trY are read as 2-D (shape[1] is taken from each).
size_in  = trX.shape[1]
size_out = trY.shape[1]
# Layer widths: input, two hidden layers of 512 and 64 units, output.
size_h   = [size_in,512,64,size_out]
batch_size = 128

# define tf symbolic variables
X  = tf.placeholder("float", [None, size_in])
Y  = tf.placeholder("float", [None, size_out])
# One init_weights call per consecutive pair of widths in size_h, i.e. shapes
# [size_in,512], [512,64] and [64,size_out]. init_weights is defined outside
# this view.
W  = [init_weights([hi,ho]) for hi,ho in zip(size_h[:-1],size_h[1:])]

# define the model & operations
# Two graphs over the same placeholder X and the same weight list W: Yt with
# dropout=0.3 and Yp with dropout=0.0 (presumably training vs. prediction).
Yt         = build_model(X, W, dropout=0.3)
Yp         = build_model(X, W, dropout=0.0)
Esempio n. 3
0
# Return on budget: gross divided by budget, row by row.
# NOTE(review): rows with a zero or missing budget would presumably yield
# inf/NaN here -- confirm the budget column was cleaned beforehand.
movies["budgetReturn"] = movies["gross"] / movies["budget"]

# Create a column 'isLong' flagging whether a movie is long
# (runtime greater than the standard 120 minutes): 1 if so, otherwise 0.

movies["isLong"] = np.where((movies["runtime"] > 120), 1,0)

# Create an analogous column 'isShort' flagging whether a movie is
# short (runtime less than the standard 75-minute minimum): 1 if so, otherwise 0.

movies["isShort"] = np.where((movies["runtime"] < 75),1,0)


# Split the data into training and testing sets (20% test, fixed random_state).
# tsplit is presumably sklearn's train_test_split, imported outside this view.
# NOTE(review): the split happens before the column selection below, so
# movies_train / movies_test keep the earlier column layout.
movies_train, movies_test = tsplit(movies, test_size = 0.2, random_state = 1)

# Move columns to more appropriate locations. Only the columns listed here are
# kept; 'isMajor' must already exist (it is created outside this view).
movies = movies[['name','genre', 'rating' , 'country', 'company', 'isMajor',
                 'released', 'year', 'runtime', 'isLong', 'isShort',
                 'score', 'votes', 'star', 'director', 
                 'budget', 'gross', 'budgetReturn']]

# Check for duplicate rows. This call only computes the duplicate mask; its
# result is not stored and nothing is removed (the original author reports
# that no duplicates were found).
movies.duplicated()

# Split the data into features (independent variables) and response
# (dependent variable).
# features1 is the label slice from 'name' through 'budget'; with the column
# order set above this leaves out 'gross'. features2 is the 'budgetReturn'
# column, and the two are joined column-wise. The response itself is not
# extracted in the visible part (presumably it is 'gross' -- confirm).
features1 = movies.loc[:,"name":"budget"]
features2 = movies.loc[:,"budgetReturn"]
features = pd.concat([features1, features2],axis = 1)
Esempio n. 4
0
# -*- encoding: utf-8 -*-
"""
8.4.2 Bayesian classification
"""

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectors
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes model
from sklearn.model_selection import train_test_split as tsplit
from sklearn.metrics import classification_report  # classification report function

X, y = fetch_20newsgroups(return_X_y=True)  # fetch the newsgroup texts and their class labels
vectorizer = TfidfVectorizer()
# NOTE(review): the vectorizer is fitted on the whole corpus before the split
# below, so the test documents also contribute to the fitted vocabulary/weights.
vdata = vectorizer.fit_transform(X)  # convert the texts to TF-IDF vectors

# Hold out 10% of the samples for testing; no random_state is passed.
x_train, x_test, y_train, y_test = tsplit(vdata, y, test_size=0.1)
m = MultinomialNB()  # instantiate the multinomial naive Bayes classifier
m.fit(x_train, y_train)  # train the model

# Despite the variable name, the message printed below labels this value as
# the test-set classification accuracy.
precision = m.score(x_test, y_test)
print('测试集分类准确率:%0.2f' % precision)  # message: "test-set classification accuracy"

y_pred = m.predict(x_test)
report = classification_report(y_test, y_pred)
print('测试集分类结果报告:\n', report)  # message: "test-set classification report"