Example #1
0
def run_boosting(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Classifies the data using sklearn's ADAboost
    Does not natively support pruning so max_depth is being used for the decision tree

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        max_depth: maximum tree depth to be applied (will simulate pruning)
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """
    time_1 = time.time()

    #set up underlying decision tree classifier
    base_classifier = tree.DecisionTreeClassifier()

    #set up the boosting method
    estimator = ensemble.AdaBoostClassifier(base_estimator = base_classifier)
    
    #set up parameters for the classifier
    parameters = {'base_estimator__max_depth': range(1, 5), 'n_estimators' : range(10, 500, 50), 'learning_rate' : [.25, .5, .75, 1.0] }

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves
    for param in parameters:
        if(is_number(parameters[param][0])):
            title = 'Validation Curves \n(AdaBoost)' 
            save_name = "Validation Curves - AdaBoost - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up parameters for the classifier
    if(passed_parameters is None):
        parameters = {'base_estimator__max_depth': range(1, 3), 'n_estimators' : range(5, 51, 5), 'learning_rate' : [1.0] }
    else:
        parameters = passed_parameters

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    #get the prediction and accuracy of the test set
    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #graph the best result
    base_classifier = tree.DecisionTreeClassifier(max_depth = classifier.best_estimator_.base_estimator_.max_depth)
    estimator = ensemble.AdaBoostClassifier(base_estimator = base_classifier, n_estimators = classifier.best_estimator_.n_estimators, learning_rate = classifier.best_estimator_.learning_rate)

    #plot the learning curve
    title = 'Learning Curves (AdaBoost - Decision Tree)\n max_depth=%i estimators=%i learning_rate=%f$' % (classifier.best_estimator_.base_estimator_.max_depth, classifier.best_estimator_.n_estimators, classifier.best_estimator_.learning_rate)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - AdaBoost - Decision Tree.png'))
    
    time_3 = time.time()

    #fit the best eetimator
    estimator.fit(training_features, training_labels) 

    #plot the learning curve by number of estimators
    plot_adaclassifier(estimator, classifier.best_estimator_.n_estimators, training_features, test_features, training_labels, test_labels)
    pylab.savefig(os.path.join(results_location, 'Estimator Curves - AdaBoost - Decision Tree.png'))

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))
    
    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Example #2
0
def run_decision_tree(training_features, training_labels, test_features, test_labels, passed_parameters = None, headings = None):
    """
    Classifies the data using sklearn's decision tree 
    Does not natively support pruning so max_depth is being used

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        max_depth: maximum tree depth to be applied (will simulate pruning)
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()
    
    #set up parameters for the classifier
    if(passed_parameters == None):
        parameters = {'max_depth': None}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves
    for param in parameters:
        if(is_number(parameters[param][0])):
            title = 'Validation Curves \n(Decision Tree)' 
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = tree.DecisionTreeClassifier(max_depth = classifier.best_estimator_.max_depth, criterion = classifier.best_estimator_.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve
    title = 'Learning Curves \n(Decision Tree, max depth=%i)' %classifier.best_estimator_.max_depth
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - Decision Tree.png'))
    #plt.show()

    #save the visualization of the decision tree only use the top 5 levels for now
    tree_data = StringIO() 
    tree.export_graphviz(estimator, out_file=tree_data, max_depth=5, feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue()) 
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf")) 

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))
    
    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Example #3
0
def run_k_nearest_neighbors(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Classifies the data using sklearn's k nearest neighbors classifier

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        k: number of nearest neighbors used in the algorithm
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = neighbors.KNeighborsClassifier()
    
    #set up parameters for the classifier
    if(passed_parameters is None):
        parameters = {'n_neighbors': range(1, 11), 'weights': ['uniform', 'distance'], 'p': [1, 2] }
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves
    for param in parameters:
        if(is_number(parameters[param][0])):
            title = 'Validation Curves \n(kNN)' 
            save_name = "Validation Curves - kNN - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = neighbors.KNeighborsClassifier(n_neighbors = classifier.best_estimator_.n_neighbors, weights = classifier.best_estimator_.weights, algorithm = classifier.best_estimator_.algorithm, leaf_size = classifier.best_estimator_.leaf_size, p = classifier.best_estimator_.p, metric = classifier.best_estimator_.metric)

    #plot the learning curve
    title = 'Learning Curves \n(k-NN, k-neighbors=%i weights=%s algorithm=%s leaf size=%i p=%i )' % (classifier.best_estimator_.n_neighbors, classifier.best_estimator_.weights, classifier.best_estimator_.algorithm, classifier.best_estimator_.leaf_size, classifier.best_estimator_.p)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - kNN.png'))
    #plt.show()

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("kNN Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))
    
    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Example #4
0
def run_support_vector_machines(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Classifies the data using sklearn's support vector machine classifier

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        kernel: (optional) Kernel to be used in the svm classifier can be 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = svm.SVC()
    
    #set up parameters that will be used by all kernels
    if(passed_parameters is None):
        parameters = {'C': [1e0, 5e0, 1e1, 5e1]}
    else:
        parameters = passed_parameters 

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #plot the validation curves
    for param in parameters:
        if(is_number(parameters[param][0])):
            title = 'Validation Curves'
            save_name = "Validation Curves - SVC - %s.png" % param
            plot_validation_curve(estimator, training_features, training_labels, title, param, parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = svm.SVC(kernel = classifier.best_estimator_.kernel, C = classifier.best_estimator_.C, gamma = classifier.best_estimator_.gamma, degree = classifier.best_estimator_.degree)

    #plot the learning curve
    title = 'Learning Curves (SVM, kernel=%s degree=%i gamma=%f C=%i )' % (classifier.best_estimator_.kernel, classifier.best_estimator_.degree, classifier.best_estimator_.gamma, classifier.best_estimator_.C)
    plot_learning_curve(estimator, title, training_features, training_labels, cv=cv)
    save_file_name = 'Learning Curves - SVM.png'
    pylab.savefig(os.path.join(results_location, save_file_name))
    #plt.show()

    time_3 = time.time()

    if(classifier.best_estimator_.kernel == 'linear'):
        coefficients = classifier.estimator.coef_
        print('\n\n-----------------------')
        print(' Coefficients')
        print(coefficients)

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("SVM Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true = test_labels, y_pred = test_prediction))
    
    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true = test_labels, y_pred = test_prediction))

    return test_prediction, test_accuracy
Example #5
0
def run_boosting(training_features,
                 training_labels,
                 test_features,
                 test_labels,
                 passed_parameters=None):
    """
    Classifies the data using sklearn's ADAboost
    Does not natively support pruning so max_depth is being used for the decision tree

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        max_depth: maximum tree depth to be applied (will simulate pruning)
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """
    time_1 = time.time()

    #set up underlying decision tree classifier
    base_classifier = tree.DecisionTreeClassifier()

    #set up the boosting method
    estimator = ensemble.AdaBoostClassifier(base_estimator=base_classifier)

    #set up parameters for the classifier
    parameters = {
        'base_estimator__max_depth': range(1, 5),
        'n_estimators': range(10, 500, 50),
        'learning_rate': [.25, .5, .75, 1.0]
    }

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves \n(AdaBoost)'
            save_name = "Validation Curves - AdaBoost - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up parameters for the classifier
    if (passed_parameters is None):
        parameters = {
            'base_estimator__max_depth': range(1, 3),
            'n_estimators': range(5, 51, 5),
            'learning_rate': [1.0]
        }
    else:
        parameters = passed_parameters

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    #get the prediction and accuracy of the test set
    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #graph the best result
    base_classifier = tree.DecisionTreeClassifier(
        max_depth=classifier.best_estimator_.base_estimator_.max_depth)
    estimator = ensemble.AdaBoostClassifier(
        base_estimator=base_classifier,
        n_estimators=classifier.best_estimator_.n_estimators,
        learning_rate=classifier.best_estimator_.learning_rate)

    #plot the learning curve
    title = 'Learning Curves (AdaBoost - Decision Tree)\n max_depth=%i estimators=%i learning_rate=%f$' % (
        classifier.best_estimator_.base_estimator_.max_depth,
        classifier.best_estimator_.n_estimators,
        classifier.best_estimator_.learning_rate)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location,
                     'Learning Curves - AdaBoost - Decision Tree.png'))

    time_3 = time.time()

    #fit the best eetimator
    estimator.fit(training_features, training_labels)

    #plot the learning curve by number of estimators
    plot_adaclassifier(estimator, classifier.best_estimator_.n_estimators,
                       training_features, test_features, training_labels,
                       test_labels)
    pylab.savefig(
        os.path.join(results_location,
                     'Estimator Curves - AdaBoost - Decision Tree.png'))

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Example #6
0
def run_k_nearest_neighbors(training_features,
                            training_labels,
                            test_features,
                            test_labels,
                            passed_parameters=None):
    """
    Classifies the data using sklearn's k nearest neighbors classifier

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        k: number of nearest neighbors used in the algorithm
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = neighbors.KNeighborsClassifier()

    #set up parameters for the classifier
    if (passed_parameters is None):
        parameters = {
            'n_neighbors': range(1, 11),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves \n(kNN)'
            save_name = "Validation Curves - kNN - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = neighbors.KNeighborsClassifier(
        n_neighbors=classifier.best_estimator_.n_neighbors,
        weights=classifier.best_estimator_.weights,
        algorithm=classifier.best_estimator_.algorithm,
        leaf_size=classifier.best_estimator_.leaf_size,
        p=classifier.best_estimator_.p,
        metric=classifier.best_estimator_.metric)

    #plot the learning curve
    title = 'Learning Curves \n(k-NN, k-neighbors=%i weights=%s algorithm=%s leaf size=%i p=%i )' % (
        classifier.best_estimator_.n_neighbors,
        classifier.best_estimator_.weights,
        classifier.best_estimator_.algorithm,
        classifier.best_estimator_.leaf_size, classifier.best_estimator_.p)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(os.path.join(results_location, 'Learning Curves - kNN.png'))
    #plt.show()

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("kNN Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Example #7
0
def run_decision_tree(training_features,
                      training_labels,
                      test_features,
                      test_labels,
                      passed_parameters=None,
                      headings=None):
    """
    Classifies the data using sklearn's decision tree 
    Does not natively support pruning so max_depth is being used

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        max_depth: maximum tree depth to be applied (will simulate pruning)
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = tree.DecisionTreeClassifier()

    #set up parameters for the classifier
    if (passed_parameters == None):
        parameters = {'max_depth': None}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves \n(Decision Tree)'
            save_name = "Validation Curves - Decision Tree - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = tree.DecisionTreeClassifier(
        max_depth=classifier.best_estimator_.max_depth,
        criterion=classifier.best_estimator_.criterion)
    estimator.fit(training_features, training_labels)

    #plot the learning curve
    title = 'Learning Curves \n(Decision Tree, max depth=%i)' % classifier.best_estimator_.max_depth
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    pylab.savefig(
        os.path.join(results_location, 'Learning Curves - Decision Tree.png'))
    #plt.show()

    #save the visualization of the decision tree only use the top 5 levels for now
    tree_data = StringIO()
    tree.export_graphviz(estimator,
                         out_file=tree_data,
                         max_depth=5,
                         feature_names=headings)
    graph = pydot.graph_from_dot_data(tree_data.getvalue())
    graph.write_pdf(os.path.join(results_location, "Decision Tree Model.pdf"))

    time_3 = time.time()

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("Decision Tree Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy
Example #8
0
def run_support_vector_machines(training_features,
                                training_labels,
                                test_features,
                                test_labels,
                                passed_parameters=None):
    """
    Classifies the data using sklearn's support vector machine classifier

    Parameters
    ----------
        training_data: data used to train the classifier. For each row, item 0 assumed to be the label
        test_data: data used to test the classifier. For each row, item 0 assumed to be the label
        kernel: (optional) Kernel to be used in the svm classifier can be 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """

    time_1 = time.time()

    estimator = svm.SVC()

    #set up parameters that will be used by all kernels
    if (passed_parameters is None):
        parameters = {'C': [1e0, 5e0, 1e1, 5e1]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0],
                      n_iter=5,
                      test_size=0.2,
                      random_state=0)

    #plot the validation curves
    for param in parameters:
        if (is_number(parameters[param][0])):
            title = 'Validation Curves'
            save_name = "Validation Curves - SVC - %s.png" % param
            plot_validation_curve(estimator, training_features,
                                  training_labels, title, param,
                                  parameters[param], cv)
            pylab.savefig(os.path.join(results_location, save_name))

    #set up tuning algorithm
    classifier = GridSearchCV(estimator=estimator,
                              cv=cv,
                              param_grid=parameters)

    #fit the classifier
    classifier.fit(training_features, training_labels)

    test_prediction = classifier.predict(test_features)
    test_accuracy = classifier.score(test_features, test_labels)

    time_2 = time.time()

    #show the best result
    estimator = svm.SVC(kernel=classifier.best_estimator_.kernel,
                        C=classifier.best_estimator_.C,
                        gamma=classifier.best_estimator_.gamma,
                        degree=classifier.best_estimator_.degree)

    #plot the learning curve
    title = 'Learning Curves (SVM, kernel=%s degree=%i gamma=%f C=%i )' % (
        classifier.best_estimator_.kernel, classifier.best_estimator_.degree,
        classifier.best_estimator_.gamma, classifier.best_estimator_.C)
    plot_learning_curve(estimator,
                        title,
                        training_features,
                        training_labels,
                        cv=cv)
    save_file_name = 'Learning Curves - SVM.png'
    pylab.savefig(os.path.join(results_location, save_file_name))
    #plt.show()

    time_3 = time.time()

    if (classifier.best_estimator_.kernel == 'linear'):
        coefficients = classifier.estimator.coef_
        print('\n\n-----------------------')
        print(' Coefficients')
        print(coefficients)

    #output time stats
    #time 1 -> time 2 is optimization time
    #time 2 -> time 3 is run for just one case
    print("SVM Time Stats")
    print("Optimization Time -> %f" % (time_2 - time_1))
    print("Single Run Time -> %f" % (time_3 - time_2))

    #output classification report and confusion matrix
    print('\n\n----------------------------')
    print('Classification Report')
    print('----------------------------\n')
    print(classification_report(y_true=test_labels, y_pred=test_prediction))

    print('\n\n----------------------------')
    print('Confusion Matrix')
    print('----------------------------\n')
    print(confusion_matrix(y_true=test_labels, y_pred=test_prediction))

    return test_prediction, test_accuracy