def mainTestPred(withhold=0, params=None):
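    """Train on the files in `train_dir`, predict labels for the files in
    `test_dir`, and write the predictions to `params['outputFile']`.

    A hypothetical call (the feature-function names must be keys of the
    module-level `feature_functions` dict):

        mainTestPred(params={'ffs': ['system_call_unigram_feats'],
                             'outputFile': 'mypredictions.csv'})
    """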
    import classification_methods as classif

    # default value for params
    if params is None:
        params = {}

    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,

            # output file for predictions (override via params)
            'outputFile': "mypredictions.csv",

            # keyword arguments to `classif.classify`
            'options': {},

            # k-fold cross-validation
            'n_folds': 10,

            # names of feature functions to use
            'ffs': ['system_call_unigram_feats']
        },
        **params)

    op = dict(params['options'])

    train_dir = "train"
    test_dir = "test"

    # look up the feature functions named in params['ffs']
    ffs = [feature_functions[f] for f in params['ffs']]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, y_train, train_ids = extract_feats(
        ffs, train_dir)
    print "done extracting training features"
    print

    print "extracting test features..."
    X_test, _, y_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # make predictions on test data and write them out
    print "making predictions..."
    preds = classif.classify(X_train, y_train, X_test, **op)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, params['outputFile'])
    print "done!"


def mainTestIter(withhold=0, params=None):
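    """Sweep the learner option named by `params['option']` over the values
    in `params['range']`, estimating each value's classification error with
    k-fold cross-validation on the training data.

    Returns a dict mapping each value in `params['range']` to a
    (mean error, std. deviation) tuple across the folds.
    """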
    from sklearn import cross_validation
    import classification_methods as classif

    # default value for params
    if params is None:
        params = {}

    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,
            'loadTest': False,

            # keyword arguments to `classif.classify`
            'options': {},

            # the option to cycle through
            'option': None,

            # range of values to cycle through
            'range': [],

            # k-fold cross-validation
            'n_folds': 10,

            # names of feature functions to use
            'ffs': ['system_call_count_feats', 'system_call_2gram_feats']
        },
        **params)

    train_dir = "train"
    test_dir = "test"

    # look up the feature functions named in params['ffs']
    ffs = [feature_functions[f] for f in params['ffs']]

    print
    print "extracting training/testing features..."
    time1 = time.clock()
    # X_train, y_train, train_ids, X_test, y_test, test_ids = test.loadData(params, withhold, ffs)
    X, y, ids, _, _, _ = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2 - time1, "s"
    print "%d data, %d features" % X.shape
    print

    # options for the learning engine
    options = params['options']

    # array to store errors for various values of learning options
    errors = []

    # iterate through each value of `params['option']` in `params['range']`
    # and calculate the error for that value
    print "iterating over values of %s from %s ... %s" % (
        params['option'], params['range'][0], params['range'][-1])
    print "================================================================================"
    for (i, value) in enumerate(params['range']):
        print "%s = %s" % (params['option'], str(value))
        op = dict(options)
        op[params['option']] = value

        # generate k cross-validation folds
        kf = cross_validation.KFold(len(y),
                                    n_folds=params['n_folds'],
                                    shuffle=True)
        print "k-fold cross-validation with %d folds" % params['n_folds']
        cv_err = []

        # for each cv fold
        for train_idx, test_idx in kf:

            # generate partition
            X_train, y_train = X[train_idx], y[train_idx]
            X_test, y_test = X[test_idx], y[test_idx]

            # train and predict
            print "learning and predicting..."
            time1 = time.clock()

            preds = classif.classify(X_train, y_train, X_test, **op)
            time2 = time.clock()
            print "done learning and predicting, ", time2 - time1, "s"
            print

            # cross-validate
            cv_err.append(testCatErr(preds, y_test))
            print "Err on withheld data: %f" % cv_err[-1]
            print

        # calculate mean, std. across folds
        cv_err_mean, cv_err_std = np.mean(cv_err), np.std(cv_err)

        print
        print "Avg. Err: %f" % cv_err_mean
        print "Std. Err: %f" % cv_err_std
        errors.append((cv_err_mean, cv_err_std))

        print "--------------------------------------------------------------------------------"

    print "================================================================================"

    # tabulate results
    results = dict()

    print "Features:"
    print params['ffs']
    print
    print "Options:"
    print options
    print

    print "Results:"
    print "%18s \t Err \t std" % params['option']
    for (i, value) in enumerate(params['range']):
        print "%18s \t %f \t %f" % (value, errors[i][0], errors[i][1])

        # lists aren't hashable, so convert list-valued options to tuples
        # before using them as dict keys
        if isinstance(value, list):
            value = tuple(value)

        results[value] = errors[i]

    return results
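
# A hypothetical sweep (the option name must be a keyword argument that
# classification_methods.classify accepts):
#
#     results = mainTestIter(params={'option': 'C',
#                                    'range': [0.01, 0.1, 1.0, 10.0],
#                                    'n_folds': 5})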