Exemple #1
0
def fit(train_percentage,fold=5):
    """Train a polynomial-kernel SVM on random splits and return the best round.

    The externally defined ``Xall`` is treated as 10 consecutive blocks of
    100 rows (presumably one block per class -- confirm against the code
    that builds ``Xall``).  In every round, ``int(train_percentage)`` rows
    are drawn at random without replacement from each block for training
    and the remaining rows of the block are used for testing.

    Args:
        train_percentage: Number of rows per block (out of 100) used for
            training.  Truncated with ``int()``.
        fold: Number of independent random splits to train and evaluate.

    Returns:
        Tuple ``(ac, clf)``: the highest test score returned by ``acc.get``
        over all rounds, and the classifier fitted in that round.

    Raises:
        ValueError: If ``fold`` is smaller than 1, or if the truncated
            ``train_percentage`` is not strictly between 0 and 100 (either
            the training or the test set would be empty).
    """
    n_blocks = 10
    rows_per_block = 100

    # Truncate once so the training rows, the test rows and both label
    # vectors are all sized from the same integer.
    n_train = int(train_percentage)
    n_test = rows_per_block - n_train
    if not 0 < n_train < rows_per_block:
        raise ValueError(
            "train_percentage must be between 1 and %d, got %r"
            % (rows_per_block - 1, train_percentage))

    n_folds = int(fold)
    if n_folds < 1:
        raise ValueError("fold must be at least 1, got %r" % (fold,))

    best_ac = None
    best_clf = None
    for _ in range(n_folds):
        train_rows = []
        test_rows = []
        for block_number in range(n_blocks):
            start = block_number * rows_per_block
            block = range(start, start + rows_per_block)

            # Random training picks; everything left over in the block is
            # kept (in ascending index order) for testing.
            chosen = random.sample(block, n_train)
            chosen_set = set(chosen)
            train_rows.extend(Xall[i] for i in chosen)
            test_rows.extend(Xall[i] for i in block if i not in chosen_set)

        # Stack once per round instead of growing the matrices row by row.
        train_matrix = np.vstack(train_rows)
        test_matrix = np.vstack(test_rows)

        Y = feature.geny(n_train)
        y = feature.geny(n_test)

        clf = svm.SVC(kernel='poly',C=1,probability=True)
        clf.fit(train_matrix, Y)

        # Score on the held-out rows (test accuracy, not training accuracy).
        res = clf.predict(test_matrix)
        ac = acc.get(res,y)
        print("accuracy = ", ac)

        # Keep the best round; returning from inside the loop would make
        # every round after the first unreachable.
        if best_ac is None or ac > best_ac:
            best_ac = ac
            best_clf = clf

    return best_ac, best_clf
Exemple #2
0
def 多项式模型(X, Y):
    """Fit a polynomial-kernel SVC on (X, Y) and score it on the same data.

    Returns:
        Tuple ``(clf, restrain)``: the fitted classifier and the value
        ``acc.get`` produces for its predictions on ``X`` compared to ``Y``.
    """
    svc_options = {
        'kernel': 'poly',
        'C': 0.1,
        'probability': True,
        'decision_function_shape': 'ovo',
        'random_state': 0,
    }
    model = svm.SVC(**svc_options)
    model.fit(X, Y)

    # The score is computed on the training data itself, not held-out data.
    training_score = acc.get(model.predict(X), Y)
    return model, training_score
Exemple #3
0
def poly(X, Y):
    """Train a polynomial-kernel SVM and report its score on the training set.

    Returns:
        Tuple ``(clf, resTrain)``: the fitted model and the value returned
        by ``acc.get`` for the model's predictions on ``X`` against ``Y``.
    """
    # Build the model.
    classifier = svm.SVC(
        kernel='poly',
        C=1,
        probability=True,
        decision_function_shape='ovo',
        random_state=0,
    )

    # Train it.
    classifier.fit(X, Y)

    # Predict the training set again to obtain the training score.
    predicted = classifier.predict(X)
    return classifier, acc.get(predicted, Y)
Exemple #4
0
def 多次训练并保存模型(train_percentage=0.7,
              fold=1,
              music_csv_file_path=None,
              model_out_f=None):  #fit_dump_model
    """Train ``fold`` models on random splits and save the best-scoring one.

    Each round shuffles the CSV data, splits it into training and test
    parts, trains a model with ``多项式模型`` and scores it as
    ``0.35 * train_score + 0.65 * test_score``.  The model with the highest
    combined score is written to ``model_out_f`` with ``joblib.dump``.

    Args:
        train_percentage: Passed to ``train_test_split`` as ``train_size``.
        fold: Number of training rounds; truncated with ``int()``.
        music_csv_file_path: Header-less CSV whose last column holds the
            label and whose other columns hold the features.  Falls back to
            ``歌曲特征文件存放路径`` when empty.
        model_out_f: Destination of the saved model.  Falls back to
            ``模型保存路径`` when empty.

    Raises:
        ValueError: If ``fold`` is smaller than 1.  Without any training
            round there is no model to report on or to save.
    """
    rounds = int(fold)
    if rounds < 1:
        raise ValueError("fold must be at least 1, got %r" % (fold,))

    if not music_csv_file_path:
        music_csv_file_path = 歌曲特征文件存放路径
    data = pd.read_csv(music_csv_file_path,
                       sep=',',
                       header=None,
                       encoding='utf-8')

    max_train_source = None
    max_test_source = None
    max_source = None
    best_clf = None
    for index in range(1, rounds + 1):
        print(index)
        shuffle_data = shuffle(data)
        # Every column except the last is a feature; the last is the label.
        X = shuffle_data.T[:-1].T
        Y = np.array(shuffle_data.T[-1:])[0]
        # No random seed is fixed, so every round gets a different split.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, train_size=train_percentage)

        # The helper returns the model and its score on the training set.
        (clf, train_source) = 多项式模型(x_train, y_train)
        y_predict = clf.predict(x_test)
        test_source = acc.get(y_predict, y_test)  # score on the test set
        source = 0.35 * train_source + 0.65 * test_source  # combined score

        # The first round always becomes the best; later rounds must beat it.
        if max_source is None or max_source < source:
            max_source = source
            max_train_source = train_source
            max_test_source = test_source
            best_clf = clf
        print('第%d次训练,训练集上的正确率为:%0.2f, 测试集上正确率为:%0.2f,加权平均正确率为:%0.2f'%(index , train_source,\
                                                                       test_source, source ))
    print('最优模型效果:训练集上的正确率为:%0.2f,测试集上的正确率为:%0.2f, 加权评卷正确率为:%0.2f'%(max_train_source,\
                                                                     max_test_source, max_source))
    print('最优模型是:')
    print(best_clf)
    if not model_out_f:
        model_out_f = 模型保存路径
    joblib.dump(best_clf, model_out_f)
Exemple #5
0
        Y = feature.geny(train_percentage) 
        y = feature.geny(100 - train_percentage)
        #training accuracy
        clf = svm.SVC(kernel='poly',C=1)
        clf.fit(train_matrix, Y)
        #train case
        #scores = cross_val_score(clf, train_matrix, feature.geny(train_percentage), cv=5)
        #print("acc train",scores.mean())
        #test case
        #scores = cross_val_score(clf, test_matrix, feature.geny(100 - train_percentage), cv=5)
        #print("acc test",scores.mean())
>>>>>>> ce120f1d2dfdc28237def03235925ff37c879e8b
        res = clf.predict(train_matrix)
        #print(acc.get(res,Y))
        resTrain += acc.get(res,Y)
        res = clf.predict(test_matrix)
        resTest += acc.get(res,y)
<<<<<<< HEAD
    
    print("Training accuracy with %d fold %f: " % (int(fold), resTrain / int(fold)))
    print("Testing accuracy with %d fold %f: " % (int(fold), resTest / int(fold)))
    

=======
        #print(acc.get(res,y))
        #return test_matrix, train_matrix
        #scores = cross_val_score(clf, test_matrix, y, cv=5)
        #print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        
        #score = 100 *clf.score(test_matrix,y)
Exemple #6
0
def fit_dump_model(train_percentage=0.7, fold=1, music_csv_file_path=None, model_out_f=None):
    """
    Train ``fold`` times and write the best-scoring model to a file.

    train_percentage: share of the data used for training in each round,
        in the range (0, 1); passed to ``train_test_split`` as ``train_size``.
    fold: number of training rounds; must be an integer greater than 0.
    music_csv_file_path: path of the header-less CSV data file (last column
        is the label); falls back to ``default_music_csv_file_path``.
    model_out_f: path the best model is dumped to; falls back to
        ``default_model_file_path``.
    NOTE: models are ranked by a weighted sum of the training-set and
        test-set scores: source = 0.35*train_source + 0.65*test_source

    Raises ValueError when ``fold`` is smaller than 1, because no model
    would be trained and there would be nothing to report or save.
    """
    rounds = int(fold)
    if rounds < 1:
        raise ValueError("fold must be at least 1, got %r" % (fold,))

    # 1. Read the data.
    if not music_csv_file_path:
        music_csv_file_path = default_music_csv_file_path
    data = pd.read_csv(music_csv_file_path, sep=',', header=None, encoding='utf-8')

    # 2. Train repeatedly, remembering the best round.
    max_train_source = None
    max_test_source = None
    max_source = None
    best_clf = None
    for index in range(1, rounds + 1):
        # 2.1 Shuffle, then separate features (all but the last column)
        #     from labels (the last column) and split into train/test.
        shuffle_data = shuffle(data)
        X = shuffle_data.T[:-1].T
        Y = np.array(shuffle_data.T[-1:])[0]
        x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=train_percentage)

        # 2.2 Train the model; the helper also returns the training score.
        (clf, train_source) = poly(x_train, y_train)

        # 2.3 Score on the test set and compute the combined score.
        y_predict = clf.predict(x_test)
        test_source = acc.get(y_predict, y_test)
        source = 0.35 * train_source + 0.65 * test_source

        # 2.4 Keep the round with the highest combined score; the first
        #     round always becomes the initial best.
        if max_source is None or max_source < source:
            max_source = source
            max_train_source = train_source
            max_test_source = test_source
            best_clf = clf

        # 2.5 Report this round.  The message names the test-set score
        #     first and the training-set score second, so the arguments
        #     are passed in that order.
        print("第%d次训练,测试集上正确率为: %.2f, 训练集上正确率为:%.2f, 加权平均正确率为: %.2f" % (index, test_source, train_source, source))

    # 3. Report the best model (same label order: test first, then train).
    print("最优模型效果:测试集上正确率为: %.2f, 训练集上正确率为:%.2f, 加权平均正确率为: %.2f" % (max_test_source, max_train_source, max_source))
    print("*" * 5 + "最优模型" + "*" * 5)
    print(best_clf)
    print("*" * 15)

    # 4. Persist the best model.
    if not model_out_f:
        model_out_f = default_model_file_path
    joblib.dump(best_clf, model_out_f)