Beispiel #1
0
def get_training_datas(method, all=True, replace=False):
    """
    Construct training, validation and testing data, and the kernel.

    :param: method: string, method used for computing kernels
    :param: all: Boolean, when True (default) the result is read from / written
        to the pickle cache 'training_data_<method>.pkl'; when False the cache
        is bypassed entirely (nothing is read or written) and everything is
        rebuilt in memory.
        NOTE(review): computing K in the uncached mode is the presumed intent
        (this mode previously failed with UnboundLocalError) - confirm.
    :param: replace: Boolean, whether or not replace the existing files in the repo
    :return:
        - X_train: pd.DataFrame, training sequences
        - y_train: pd.DataFrame, training labels
        - X_val: pd.DataFrame, validation sequences
        - y_val: pd.DataFrame, validation labels
        - X_test: pd.DataFrame, testing sequences
        - K: np.array, kernel
        - ID: np.array, Ids
    """
    filename = 'training_data_' + method + '.pkl'

    # Fast path: reuse the cached tuple when caching is enabled, the file is
    # already in the repo and the caller did not ask to overwrite it.
    if all and trainInRepo(filename) and not replace:
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(os.path.join(path, filename), 'rb') as cache:
            X_train, y_train, X_val, y_val, X_test, K, ID = pkl.load(cache)
        return X_train, y_train, X_val, y_val, X_test, K, ID

    X_train, y_train, X_val, y_val, X_test = train_test_split()
    # Test Ids are remapped to strictly negative values: Id -> -(Id + 1)
    # (presumably so they cannot collide with train/validation Ids - confirm).
    X_test.loc[:, 'Id'] = -(X_test.loc[:, 'Id'] + 1)
    X = pd.concat((X_train, X_val, X_test), axis=0)
    ID = np.array(X.loc[:, 'Id'])
    K = km.select_method(X, method)

    if all:
        # Persist the freshly computed data; the context manager guarantees
        # the file handle is closed even if pickling fails.
        with open(os.path.join(path, filename), 'wb') as cache:
            pkl.dump([X_train, y_train, X_val, y_val, X_test, K, ID], cache)

    return X_train, y_train, X_val, y_val, X_test, K, ID
Beispiel #2
0
def grid_search(label,
                X,
                y,
                svm_params,
                methods,
                train_size=0.75,
                graph=False):
    """
    Grid search over kernel methods and SVM C values on one train/validation
    split obtained from train_test_split(X, y, train_size).

    Parameters:
        - label: int (0, 1 or 2), label of the set of data; used in the names
                 of the gram-matrix file and of the saved figure
        - X: array, observations
        - y: array, labels
        - svm_params: array, values of the SVM parameter C to try
        - methods: iterable of str, kernel method names, each resolved with
                   kernels.select_method
        - train_size: float (between 0 and 1), proportion of data for the train part
        - graph: bool, plot the evolution of the accuracy wrt log(svm_params) or not

    Returns the SVM classifier with the highest validation accuracy (the
    earliest one on ties), or None when no (method, C) pair was evaluated.
    """
    Xtr, ytr, Xte, yte = train_test_split(X, y, train_size)

    best_score = None
    best_clf = None

    for method in methods:
        kernel, kernel_param = kernels.select_method(method)
        print()
        scores = []

        # The gram-matrix file depends only on (label, method), not on C,
        # so build its path once per method.
        gram_file = "../gram_matrix/gramMat_" + str(
            label) + "_" + method + ".p"

        for c in svm_params:
            print('Parameters : ' + str([method, c]))
            clf = SVM.SupportVectorMachine(kernel=kernel,
                                           C=c,
                                           kernel_params=kernel_param)
            clf.fit(Xtr, ytr, gram_file)
            score = accuracy(clf.predict(Xte), yte)
            # Accept the first evaluated classifier unconditionally so that a
            # run where every score is 0 still returns a model, not None.
            if best_clf is None or score > best_score:
                best_score = score
                best_clf = clf
            print("Accuracy score = " + str(score) + '\n')
            scores.append(score)

        if graph:
            # One curve per kernel method: accuracy against log10(C).
            plt.plot(np.log10(svm_params),
                     scores,
                     label='kernel_param = ' + str(kernel_param))

    if graph:
        plt.title('Evolution of the accuracy wrt log(C)')
        plt.legend()
        plt.savefig('../res/cross_val' + str(label) + '.png')
        plt.show()

    return best_clf
Beispiel #3
0
def create_models(labels, params, train_size=0.75):
    """
    Fit one SVM per label, report its validation accuracy, and write the
    test-set predictions of all labels to '../predictions/Yte.csv'.

    Parameters:
        - labels: list, list of the labels
        - params: dict, maps each label to a (kernel method, SVM C) pair
        - train_size: float, proportion of the train part in the data set
    """
    # Running, flattened collection of the test predictions of every label.
    all_test_preds = np.empty(0)

    for label in labels:
        print("\n*******Treating group " + str(label) + "*******\n")

        X_full, y_full = get_train(label)
        X_test = get_test(label)

        # Hold out part of the labelled data to measure accuracy.
        X_fit, y_fit, X_valid, y_valid = train_test_split(
            X_full, y_full, train_size)

        method, c = params[label]
        kernel, kernel_param = kernels.select_method(method)

        gram_file = "../gram_matrix/gramMat_" + str(
            label) + "_" + method + ".p"

        clf = SVM.SupportVectorMachine(kernel=kernel,
                                       C=c,
                                       kernel_params=kernel_param)
        clf.fit(X_fit, y_fit, gram_file)

        valid_preds = clf.predict(X_valid)
        score = accuracy(valid_preds, y_valid)
        print("Accuracy score = " + str(score) + '\n')

        test_preds = clf.predict(X_test)
        all_test_preds = np.concatenate((all_test_preds, test_preds),
                                        axis=None)

    # The submission is hard-coded to 3000 rows with Ids 0..2999.
    ids = list(range(3000))
    results = pd.DataFrame({'Id': ids, 'Bound': all_test_preds})

    # int((v + 1) / 2) sends -1 to 0 and +1 to 1
    # (assumes predictions are in {-1, +1} - TODO confirm).
    bound = [int((v + 1) / 2) for v in results['Bound']]
    results['Bound'] = bound

    results.to_csv('../predictions/Yte.csv', index=False)
Beispiel #4
0
def import_data_to_train(method, all=True, replace=False):
    """
    Construct training, validation and testing data, then prepare the kernel.

    :param: method: string, kernel method name; passed to km.select_method and
        used in the cache file name 'training_data_<method>.pkl'
    :param: all: Boolean, when True (default) the result is read from / written
        to the pickle cache; when False the cache is bypassed entirely (nothing
        is read or written) and everything is rebuilt in memory.
        NOTE(review): computing K in the uncached mode is the presumed intent
        (this mode previously failed with UnboundLocalError) - confirm.
    :param: replace: Boolean, when True an existing cache file is ignored and
        overwritten
    :return: tuple (X_train, y_train, X_val, y_val, X_test, K, ID) where the
        first five items come from spli_test() (or the cache), K is the result
        of km.select_method on the concatenated data and ID is the np.array of
        the 'Id' column of that concatenation
    """
    filename = 'training_data_' + method + '.pkl'

    # Fast path: reuse the cached tuple when caching is enabled, the file is
    # already in the repo and the caller did not ask to overwrite it.
    if all and trainInRepo(filename) and not replace:
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(os.path.join(path, filename), 'rb') as cache:
            X_train, y_train, X_val, y_val, X_test, K, ID = pkl.load(cache)
        return X_train, y_train, X_val, y_val, X_test, K, ID

    # The kernel needs the data first: build the train/validation/test split.
    X_train, y_train, X_val, y_val, X_test = spli_test()
    # Test Ids are remapped to strictly negative values: Id -> -(Id + 1)
    # (presumably so they cannot collide with train/validation Ids - confirm).
    X_test.loc[:, 'Id'] = -(X_test.loc[:, 'Id'] + 1)
    X = pd.concat((X_train, X_val, X_test), axis=0)
    ID = np.array(X.loc[:, 'Id'])
    K = km.select_method(X, method)

    if all:
        # Persist the freshly computed data; the context manager guarantees
        # the file handle is closed even if pickling fails.
        with open(os.path.join(path, filename), 'wb') as cache:
            pkl.dump([X_train, y_train, X_val, y_val, X_test, K, ID], cache)

    return X_train, y_train, X_val, y_val, X_test, K, ID