def run_main():
    """Train and evaluate a k-NN classifier on ``../dataset/voice.csv``.

    The function loads the CSV, passes it through the project helpers
    ``insect_dataset`` and ``drop_na``, encodes the ``label`` column
    (``'male'`` -> 0, ``'female'`` -> 1), scales the feature columns with
    ``preprocessing.scale``, holds out one third of the rows as a test set,
    picks ``k`` from 1..30 by mean 10-fold cross-validated accuracy on the
    training set, and finally prints the test-set accuracy of the model
    fitted with that ``k``.

    Returns:
        str: always the empty string.
    """
    file_df = pd.read_csv('../dataset/voice.csv')
    insect_dataset(file_df)
    # Handle missing data.
    # NOTE(review): the return value of drop_na is ignored, so this only has
    # an effect if the helper modifies file_df in place - confirm.
    drop_na(file_df)
    # To inspect the class balance: file_df['label'].value_counts()

    # Feature names used only by the optional visualisation calls below.
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'

    # Plot two features against each other.
    # visaulize_two_feature(file_df,fea_name1,fea_name2)

    # Plot a single feature.
    # visaulize_single_feature(file_df,fea_name1)

    # Plot several features at once.
    fea_name = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    # visaulize_muilt_feature(file_df,fea_name)

    # Every column except the last one is a feature.
    X = file_df.iloc[:, :-1].values
    # Assign the encoded column back explicitly: an in-place replace on the
    # ``file_df['label']`` selection is not guaranteed to reach file_df.
    file_df['label'] = file_df['label'].replace({'male': 0, 'female': 1})
    y = file_df['label'].values

    # Feature scaling.
    # NOTE(review): scaling happens before the train/test split, so the test
    # rows contribute to the scaling statistics.
    X = preprocessing.scale(X)

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=1 / 3.,
                                                        random_state=5)

    # Model selection: cross-validate every candidate k on the training set.
    cv_scores = []
    k_range = range(1, 31)
    for k in k_range:
        knn = KNeighborsClassifier(k)
        scores = cross_val_score(knn,
                                 X_train,
                                 y_train,
                                 cv=10,
                                 scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%i:%.4f' % (k, score_mean))

    # Look the winner up in k_range instead of assuming it starts at 1.
    best_k = k_range[int(np.argmax(cv_scores))]

    # Fit the final model and report its accuracy on the held-out test set.
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    test_score = knn_model.score(X_test, y_test)
    print('%s %s' % ('测试模型,准确率:', test_score))

    return ''
Exemple #2
0
def get_best_k(X, y, max_k=30, keep_best_n=10, weights=None,
               random_state=None):
    """Rank k-nearest-neighbour settings by accuracy on a held-out split.

    The data is split 67/33 into training and test parts. For every ``k``
    from 1 up to and including ``max_k`` (capped at the number of training
    samples) and every weighting scheme in ``weights``, a
    ``KNeighborsClassifier`` is fitted on the training part and scored on
    the test part.

    Args:
        X: Feature table; validated by ``check_pandas_dataframe_nd``.
        y: Target values; validated by
            ``check_numpy_array_pandas_dataframe_series_1d`` and flattened
            to a 1-D array before use.
        max_k: Largest neighbour count to try. ``None`` means ``len(X)``.
        keep_best_n: Maximum number of results to return.
        weights: A weighting scheme or a list of them. ``None`` means
            ``['uniform', 'distance']``.
        random_state: Forwarded to ``train_test_split``. The default
            ``None`` keeps the split random, as before.

    Returns:
        list: ``(k, weight, score)`` tuples sorted by descending score,
        at most ``keep_best_n`` entries long.
    """
    # Fill in defaults.
    if max_k is None:
        max_k = len(X)

    if weights is None:
        weights = ['uniform', 'distance']

    # Wrap a single weighting scheme so the loop below can iterate over it.
    if not isinstance(weights, list):
        weights = [weights]

    # Validate inputs with the project's check helpers (defined elsewhere).
    check_pandas_dataframe_nd(X, 'X')

    check_numpy_array_pandas_dataframe_series_1d(y, 'y')

    check_list_of_strings(weights, 'weights')

    check_integer(max_k, 'max_k')
    check_larger(max_k, 'max_k', 1)

    check_integer(keep_best_n, 'keep_best_n')
    check_larger(keep_best_n, 'keep_best_n', 1)

    # Flatten y to one dimension.
    y = np.array(y).ravel()

    # Split into train and test data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=random_state)

    # A model cannot use more neighbours than it has training samples.
    max_k = min(max_k, len(X_train))

    results = []
    # max_k itself is a candidate, hence the inclusive upper bound.
    for k in range(1, max_k + 1):
        for weight in weights:
            model = KNeighborsClassifier(n_neighbors=k, weights=weight)
            model.fit(X_train, y_train)
            results.append((k, weight, model.score(X_test, y_test)))

    # Best score first; the sort is stable, so ties keep evaluation order.
    results.sort(key=lambda entry: entry[2], reverse=True)
    return results[:keep_best_n]
Exemple #3
0
import matplotlib.pyplot as plt
import mglearn

# Load the breast-cancer dataset and hold out a stratified test split.
cancer = load_breast_cancer()
# print(cancer.data)            # X : data
# print(cancer.target_names)    # y : label
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)

training_accuracy = []
test_accuracy = []

# Try n_neighbors from 1 to 10.
neighbors_setting = range(1, 11)

for n_neighbors in neighbors_setting:
    # Build and fit the model for this neighbour count.
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)
    # Record accuracy on the training set.
    training_accuracy.append(clf.score(X_train, y_train))
    # Record accuracy on the held-out test set (generalisation).
    test_accuracy.append(clf.score(X_test, y_test))

# Plot both accuracy curves against the neighbour count.
plt.plot(neighbors_setting, training_accuracy, label='training accuracy')
plt.plot(neighbors_setting, test_accuracy, label='test accuracy')
plt.ylabel('Accuracy')
plt.xlabel('n_neighbors')
plt.legend()
plt.show()