def optimal_svm(optimal_c):
    """
    Fit a linear SVM with the chosen C and report its AUC on the test set.

    The training data is read from 'train_X.csv' / 'train_y.csv' and the
    test data from 'test_X.csv' / 'test_y.csv'. Both feature sets go
    through data_pca (called with the training features as its second
    argument) before the model is fitted on the training portion and
    scored on the test portion.

    Args:
        optimal_c: value passed as the C parameter of svm.SVC, e.g. the
            one chosen during model selection.

    Returns:
        The area under the ROC curve, computed by auc() from the test
        labels and the SVM decision-function scores. The same value is
        also printed.
    """
    # load datasets
    train_X, train_y = load_data('train_X.csv', 'train_y.csv')
    test_X, test_y = load_data('test_X.csv', 'test_y.csv')

    # presumably data_pca fits on its second argument and transforms its
    # third, keeping 95% of the variance -- verify against data_pca
    train_X_pca = data_pca(0.95, train_X, train_X)
    test_X_pca = data_pca(0.95, train_X, test_X)

    # flatten the labels into 1-D arrays
    train_y = np.array(train_y).ravel()
    test_y = np.array(test_y).ravel()

    # set up model with the optimal C
    # NOTE(review): newer scikit-learn releases replaced class_weight='auto'
    # with 'balanced' -- confirm against the installed version.
    my_svm = svm.SVC(kernel='linear', C=optimal_c, class_weight='auto')
    my_svm.fit(train_X_pca, train_y)

    # continuous decision scores (not hard labels) are used to build the ROC
    scores = my_svm.decision_function(test_X_pca)
    fpr, tpr, _ = roc_curve(test_y, scores)

    test_auc = auc(fpr, tpr)
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print(test_auc)
    return test_auc
def main():
    """
    Cross-validate a range of SVM C values and plot the resulting AUCs.

    Loads the training data, runs it through data_pca, attaches the labels
    as column 'Y', evaluates every candidate C with xValSVM, summarises the
    AUCs with avg_stderr and plots the summary with plotxValSVM.
    """
    # Candidate hyperparameters: powers of ten from 10**-9 up to 10**1.
    candidate_cs = [10**exponent for exponent in range(-9, 2)]

    # Load the training set and project its features with PCA.
    features, labels = load_data('train_X.csv', 'train_y.csv')
    dataset = data_pca(0.95, features, features)

    # Keep features and labels together in one table for cross validation.
    dataset['Y'] = labels

    # AUCs per sample for each C (the 5 is presumably the number of
    # folds -- verify against xValSVM).
    fold_aucs = xValSVM(dataset, 'Y', 5, candidate_cs)

    # Average and standard error of the AUC for each C, then plot them.
    mean_auc, auc_stderr = avg_stderr(fold_aucs, candidate_cs)
    plotxValSVM(mean_auc, auc_stderr, candidate_cs)
def main():
    """
    Split the training data, apply PCA to both parts and run svm_model.

    Reads 'train_X.csv' / 'train_y.csv', divides the rows with split_data,
    transforms each feature subset with data_pca and passes the results,
    together with the matching labels, to svm_model.
    """
    all_features, all_labels = load_data("train_X.csv", "train_y.csv")

    # 0.2 is presumably the fraction held out for validation -- verify
    # against split_data.
    fit_x, holdout_x, fit_y, holdout_y = split_data(
        all_features, all_labels, 0.2
    )

    # NOTE(review): data_pca receives the full training features as its
    # second argument for both subsets -- confirm this is intended.
    fit_x_pca = data_pca(0.95, all_features, fit_x)
    holdout_x_pca = data_pca(0.95, all_features, holdout_x)

    svm_model(fit_x_pca, fit_y, holdout_x_pca, holdout_y)