def run_main():
    """Load the voice dataset, select k for KNN by cross-validation, and report test accuracy.

    Reads ``../dataset/voice.csv``, cleans it via the module helpers, encodes the
    ``label`` column (male=0, female=1), scales features, and evaluates
    ``KNeighborsClassifier`` for k in 1..30 with 10-fold CV to pick the best k.

    Returns:
        str: An empty string (kept for backward compatibility with callers).
    """
    file_df = pd.read_csv('../dataset/voice.csv')
    # Inspect the raw frame (helper defined elsewhere in this module).
    insect_dataset(file_df)
    # Fill/drop missing values.
    drop_na(file_df)
    # To inspect class balance: file_df['label'].value_counts()

    # Feature-visualization helpers (left disabled, as in the original workflow).
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'
    # visaulize_two_feature(file_df, fea_name1, fea_name2)
    # visaulize_single_feature(file_df, fea_name1)
    fea_name = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    # visaulize_muilt_feature(file_df, fea_name)

    # Features are every column except the trailing label.
    X = file_df.iloc[:, :-1].values
    # Encode labels with an explicit column assignment instead of chained
    # inplace replace (chained inplace on a column is unreliable in pandas).
    file_df['label'] = file_df['label'].replace({'male': 0, 'female': 1})
    y = file_df['label'].values

    # Standardize features (zero mean, unit variance).
    X = preprocessing.scale(X)

    # Hold out one third of the data for the final test score.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1 / 3., random_state=5)

    # Model selection: 10-fold cross-validation over k = 1..30.
    cv_scores = []
    k_range = range(1, 31)
    for k in k_range:
        knn = KNeighborsClassifier(k)
        scores = cross_val_score(knn, X_train, y_train, cv=10,
                                 scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i:%.4f' % (k, score_mean))

    # argmax is 0-based while k starts at 1, hence the +1.
    best_k = np.argmax(cv_scores) + 1

    # Fit the final model with the selected k and report held-out accuracy.
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('测试模型,准确率:', knn_model.score(X_test, y_test))
    return ''
def get_best_k(X, y, max_k=30, keep_best_n=10, weights=None):
    """Rank KNN configurations (k, weighting) by hold-out accuracy.

    Validates the inputs, splits ``X``/``y`` into train and test parts,
    fits a ``KNeighborsClassifier`` for every combination of
    ``k in 1..max_k-1`` and each weighting scheme, and returns the top
    ``keep_best_n`` configurations.

    Args:
        X: Feature matrix (pandas DataFrame).
        y: Target values (1-d array-like).
        max_k: Upper bound on the neighbor count; ``None`` means ``len(X)``.
        keep_best_n: How many of the best-scoring configurations to return.
        weights: Weighting scheme(s) to try; defaults to
            ``['uniform', 'distance']``. A single string is also accepted.

    Returns:
        list[tuple]: ``(k, weight, score)`` tuples, best score first.
    """
    # Fall back to defaults where the caller passed None.
    if max_k is None:
        max_k = len(X)
    if weights is None:
        weights = ['uniform', 'distance']

    # Accept a bare string by promoting it to a one-element list.
    if type(weights) is not list:
        weights = [weights]

    # Input validation via the project's shared checkers.
    check_pandas_dataframe_nd(X, 'X')
    check_numpy_array_pandas_dataframe_series_1d(y, 'y')
    check_list_of_strings(weights, 'weights')
    check_integer(max_k, 'max_k')
    check_larger(max_k, 'max_k', 1)
    check_integer(keep_best_n, 'keep_best_n')
    check_larger(keep_best_n, 'keep_best_n', 1)

    # Flatten y to a 1-d numpy array regardless of input container.
    y = np.array(y).ravel()

    # Hold out a third of the data to score each configuration on.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # k cannot exceed the number of held-out samples.
    max_k = min(max_k, len(X_test))

    # Score every (k, weight) combination on the hold-out set.
    scored = [
        (k, weight,
         KNeighborsClassifier(n_neighbors=k, weights=weight)
         .fit(X_train, y_train)
         .score(X_test, y_test))
        for k in range(1, max_k)
        for weight in weights
    ]

    # Highest accuracy first; keep only the requested number of entries.
    ranked = sorted(scored, key=itemgetter(2), reverse=True)
    return ranked[:keep_best_n]
import matplotlib.pyplot as plt
import mglearn

# Evaluate KNN on the breast-cancer dataset for k = 1..10 and plot how
# train vs. test accuracy trade off as the neighbor count grows.
cancer = load_breast_cancer()
# cancer.data is the feature matrix X; cancer.target holds the labels y.

# Stratified split keeps the class ratio identical in train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)

training_accuracy = []
test_accuracy = []

# Try n_neighbors from 1 through 10.
neighbors_setting = range(1, 11)
for n_neighbors in neighbors_setting:
    # Build and fit the model for this neighbor count.
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)
    # Record accuracy on the training set.
    training_accuracy.append(clf.score(X_train, y_train))
    # Record generalization (test-set) accuracy.
    test_accuracy.append(clf.score(X_test, y_test))

# Fixed typo in the legend label ('traing' -> 'training').
plt.plot(neighbors_setting, training_accuracy, label='training accuracy')
plt.plot(neighbors_setting, test_accuracy, label='test accuracy')
plt.ylabel('Accuracy')
plt.xlabel('n_neighbors')
plt.legend()
plt.show()