def fit(train_percentage, fold=5):
    """Train a polynomial-kernel SVC on a random per-class split of ``Xall``.

    The rows of the module-level ``Xall`` are treated as 10 consecutive
    classes of 100 songs each (rows 0-99, 100-199, ...).  For every fold,
    ``train_percentage`` songs are drawn at random, without replacement,
    from each class for training; the songs left over in that class form
    the test set.  A fresh classifier is fitted on each fold and its test
    score is printed.

    Args:
        train_percentage: Number of songs per class (out of 100) used for
            training.  It is truncated with ``int()`` and must satisfy
            ``0 < train_percentage < 100`` so that both splits are non-empty.
        fold: Number of random splits to run; must be at least 1.

    Returns:
        A tuple ``(ac, clf)``: the value returned by ``acc.get`` for the
        test split and the fitted classifier, both taken from the last fold.

    Raises:
        ValueError: If ``train_percentage`` or ``fold`` is out of range.

    NOTE(review): the original source had lost its indentation, so it is
    not certain whether training happened once or once per fold.  This
    version evaluates every fold and returns the last one -- confirm.
    """
    class_count = 10
    songs_per_class = 100

    # Use one integer split size everywhere; the original mixed
    # int(train_percentage) with the raw value, which broke on floats.
    train_count = int(train_percentage)
    if not 0 < train_count < songs_per_class:
        raise ValueError(
            "train_percentage must be between 1 and %d, got %r"
            % (songs_per_class - 1, train_percentage)
        )
    fold = int(fold)
    if fold < 1:
        raise ValueError("fold must be an integer >= 1, got %r" % (fold,))
    test_count = songs_per_class - train_count

    for _ in range(fold):
        train_rows = []
        test_rows = []
        for class_start in range(0, class_count * songs_per_class, songs_per_class):
            remaining = list(range(class_start, class_start + songs_per_class))
            # Draw the training songs one by one without replacement; this
            # consumes the random stream exactly as the original loop did.
            for _ in range(train_count):
                picked = random.choice(remaining)
                remaining.remove(picked)
                train_rows.append(Xall[picked])
            # Whatever was not picked (still in ascending order) is test data.
            test_rows.extend(Xall[i] for i in remaining)

        # Stack once per fold instead of re-allocating after every row.
        train_matrix = np.vstack(train_rows)
        test_matrix = np.vstack(test_rows)

        # presumably geny(n) yields n labels per class in class order,
        # matching the row order built above -- verify against feature.geny
        Y = feature.geny(train_count)
        y = feature.geny(test_count)

        clf = svm.SVC(kernel='poly', C=1, probability=True)
        clf.fit(train_matrix, Y)

        res = clf.predict(test_matrix)
        ac = acc.get(res, y)
        print("accuracy = ", ac)

    return ac, clf
def 多项式模型(X, Y):
    """Fit a polynomial-kernel SVC on ``(X, Y)`` and score it on the same data.

    Args:
        X: Training feature matrix passed to ``SVC.fit``.
        Y: Training labels passed to ``SVC.fit``.

    Returns:
        A tuple ``(model, train_accuracy)``: the fitted classifier and the
        value ``acc.get`` reports when the model's predictions on ``X`` are
        compared with ``Y`` (the training-set accuracy).
    """
    svc_options = dict(
        kernel='poly',
        C=0.1,
        probability=True,
        decision_function_shape='ovo',
        random_state=0,
    )
    model = svm.SVC(**svc_options)
    model.fit(X, Y)

    # Score the model against the very data it was trained on.
    train_accuracy = acc.get(model.predict(X), Y)
    return model, train_accuracy
def poly(X, Y):
    """Train a polynomial-kernel SVM and report its training-set accuracy.

    Args:
        X: Training feature matrix.
        Y: Training labels.

    Returns:
        A tuple ``(classifier, accuracy)``: the fitted model and the result
        of ``acc.get`` for its predictions on ``X`` versus ``Y``.
    """
    # Build the model.
    classifier = svm.SVC(
        kernel='poly', C=1, probability=True,
        decision_function_shape='ovo', random_state=0,
    )

    # Train it.
    classifier.fit(X, Y)

    # Predict the training set again to measure how well it was fitted.
    predicted = classifier.predict(X)
    return classifier, acc.get(predicted, Y)
def 多次训练并保存模型(train_percentage=0.7, fold=1, music_csv_file_path=None, model_out_f=None):  # fit_dump_model
    """Train ``fold`` times on random splits and dump the best model to a file.

    Each round shuffles the data, splits it into train and test parts, fits
    a model with ``多项式模型`` and scores it as
    ``0.35 * train_accuracy + 0.65 * test_accuracy``.  The model with the
    highest score (the earliest one on ties) is written with ``joblib.dump``.

    Args:
        train_percentage: Passed to ``train_test_split`` as ``train_size``.
        fold: Number of training rounds; must be an integer >= 1.
        music_csv_file_path: Header-less CSV file whose last column is the
            label and whose other columns are features.  A falsy value
            selects the module-level default path.
        model_out_f: Destination of the dumped model.  A falsy value
            selects the module-level default path.

    Raises:
        ValueError: If ``fold`` is smaller than 1.  Previously this case
            failed with a ``TypeError`` in the summary print, after the CSV
            had already been read.
    """
    rounds = int(fold)
    if rounds < 1:
        raise ValueError("fold must be an integer >= 1, got %r" % (fold,))

    if not music_csv_file_path:
        music_csv_file_path = 歌曲特征文件存放路径
    data = pd.read_csv(music_csv_file_path, sep=',', header=None, encoding='utf-8')

    max_train_source = None
    max_test_source = None
    max_source = None
    best_clf = None

    for index in range(1, rounds + 1):
        print(index)
        shuffle_data = shuffle(data)
        # Features: every column except the last; labels: the last column.
        X = shuffle_data.T[:-1].T
        Y = np.array(shuffle_data.T[-1:])[0]
        # No random seed is given, so every round gets a different split.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, train_size=train_percentage)

        # Returns the fitted model and its accuracy on the training set.
        clf, train_source = 多项式模型(x_train, y_train)
        y_predict = clf.predict(x_test)
        test_source = acc.get(y_predict, y_test)  # accuracy on the test set
        # Weighted overall score used to rank the rounds.
        source = 0.35 * train_source + 0.65 * test_source

        # The first round always wins; later rounds must be strictly better.
        if best_clf is None or max_source < source:
            max_source = source
            max_train_source = train_source
            max_test_source = test_source
            best_clf = clf

        print('第%d次训练,训练集上的正确率为:%0.2f, 测试集上正确率为:%0.2f,加权平均正确率为:%0.2f'
              % (index, train_source, test_source, source))

    print('最优模型效果:训练集上的正确率为:%0.2f,测试集上的正确率为:%0.2f, 加权评卷正确率为:%0.2f'
          % (max_train_source, max_test_source, max_source))
    print('最优模型是:')
    print(best_clf)

    if not model_out_f:
        model_out_f = 模型保存路径
    joblib.dump(best_clf, model_out_f)
# NOTE(review): the line below is an orphaned fragment, not a complete
# definition.  It reads names that are not defined at this level
# (train_matrix, test_matrix, resTrain, resTest, fold, train_percentage)
# and still contains Git merge-conflict markers (the runs of '>', '<' and
# '='), so it is not valid Python as it stands.  It is presumably left over
# from an earlier version of `fit` -- resolve the conflict or delete the
# fragment.  The code itself is left untouched here.
Y = feature.geny(train_percentage) y = feature.geny(100 - train_percentage) #training accuracy clf = svm.SVC(kernel='poly',C=1) clf.fit(train_matrix, Y) #train case #scores = cross_val_score(clf, train_matrix, feature.geny(train_percentage), cv=5) #print("acc train",scores.mean()) #test case #scores = cross_val_score(clf, test_matrix, feature.geny(100 - train_percentage), cv=5) #print("acc test",scores.mean()) >>>>>>> ce120f1d2dfdc28237def03235925ff37c879e8b res = clf.predict(train_matrix) #print(acc.get(res,Y)) resTrain += acc.get(res,Y) res = clf.predict(test_matrix) resTest += acc.get(res,y) <<<<<<< HEAD print("Training accuracy with %d fold %f: " % (int(fold), resTrain / int(fold))) print("Testing accuracy with %d fold %f: " % (int(fold), resTest / int(fold))) ======= #print(acc.get(res,y)) #return test_matrix, train_matrix #scores = cross_val_score(clf, test_matrix, y, cv=5) #print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) #score = 100 *clf.score(test_matrix,y)
def fit_dump_model(train_percentage=0.7, fold=1, music_csv_file_path=None, model_out_f=None):
    """Train ``fold`` times and write the highest-scoring model to a file.

    Args:
        train_percentage: Share of the data used for training in each
            round, in the open interval (0, 1); passed to
            ``train_test_split`` as ``train_size``.
        fold: Number of training rounds; must be an integer greater than 0.
        music_csv_file_path: Path of the header-less CSV data file whose
            last column is the label.  A falsy value selects
            ``default_music_csv_file_path``.
        model_out_f: Path the best model is dumped to with ``joblib.dump``.
            A falsy value selects ``default_model_file_path``.

    Raises:
        ValueError: If ``fold`` is smaller than 1.

    NOTE: The ranking metric is a weighted sum of the training-set and
    test-set accuracies:
    ``source = 0.35 * train_source + 0.65 * test_source``.
    """
    rounds = int(fold)
    if rounds < 1:
        # Without at least one round there is no model to report or dump;
        # the old code failed later with a TypeError while formatting None.
        raise ValueError("fold must be an integer > 0, got %r" % (fold,))

    # 1. Read the data.
    if not music_csv_file_path:
        music_csv_file_path = default_music_csv_file_path
    data = pd.read_csv(music_csv_file_path, sep=',', header=None, encoding='utf-8')

    # 2. Train repeatedly, remembering the best round.
    max_train_source = None
    max_test_source = None
    max_source = None
    best_clf = None

    for index in range(1, rounds + 1):
        # 2.1 Shuffle and split: all columns but the last are features,
        #     the last column is the label.
        shuffle_data = shuffle(data)
        X = shuffle_data.T[:-1].T
        Y = np.array(shuffle_data.T[-1:])[0]
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, train_size=train_percentage)

        # 2.2 Train the model.
        clf, train_source = poly(x_train, y_train)

        # 2.3 Score on the test set and compute the combined metric.
        y_predict = clf.predict(x_test)
        test_source = acc.get(y_predict, y_test)
        source = 0.35 * train_source + 0.65 * test_source

        # 2.4 Keep the best model; the first round always wins, later
        #     rounds must be strictly better.
        if best_clf is None or max_source < source:
            max_source = source
            max_train_source = train_source
            max_test_source = test_source
            best_clf = clf

        # 2.5 Report this round.  The message names the test-set accuracy
        #     first and the training-set accuracy second, so the values are
        #     passed in that order (they used to be swapped).
        print("第%d次训练,测试集上正确率为: %.2f, 训练集上正确率为:%.2f, 加权平均正确率为: %.2f"
              % (index, test_source, train_source, source))

    # 3. Report the best model (same label order as above: test, then train).
    print("最优模型效果:测试集上正确率为: %.2f, 训练集上正确率为:%.2f, 加权平均正确率为: %.2f"
          % (max_test_source, max_train_source, max_source))
    print("*" * 5 + "最优模型" + "*" * 5)
    print(best_clf)
    print("*" * 15)

    # 4. Write the best model to disk.
    if not model_out_f:
        model_out_f = default_model_file_path
    joblib.dump(best_clf, model_out_f)