def mainTestPred(withhold=0, params=None):
    """Train on the training set, predict the test set, and write a CSV.

    Features are extracted from the ``train`` and ``test`` directories with
    the feature functions named in ``params['ffs']``; predictions come from
    ``classification_methods.classify`` and are handed to
    ``util.write_predictions`` together with the test ids.

    Args:
        withhold: Not read by this function; kept so the signature matches
            ``mainTestIter``.
        params: Optional dict whose entries override the defaults below.
            Keys this function actually reads:
              * ``'ffs'``: names looked up in ``feature_functions``.
              * ``'options'``: keyword arguments forwarded to ``classify``.
              * ``'outputFile'``: path given to ``util.write_predictions``
                (defaults to ``"mypredictions.csv"``).

    Returns:
        None. The result is the file written by ``util.write_predictions``.
    """
    import classification_methods as classif

    train_dir = "train"
    test_dir = "test"
    # Default destination for the predictions; callers may override it with
    # params['outputFile'].
    outputfile = "mypredictions.csv"

    # default value for params
    if params is None:
        params = {}
    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,
            # arguments to `learn`
            'options': {},
            # k-fold cross-validation
            'n_folds': 10,
            # feature functions
            'ffs': ['system_call_unigram_feats'],
            # Without this default the final write raised KeyError whenever
            # the caller did not pass 'outputFile' explicitly.
            'outputFile': outputfile,
        },
        **params)

    # Copy so that the caller's options dict is never shared with classify.
    op = dict(params['options'])

    # Resolve feature-function names to the callables themselves.
    ffs = [feature_functions[f] for f in params['ffs']]

    # extract features
    print("extracting training features...")
    X_train, global_feat_dict, y_train, train_ids = extract_feats(
        ffs, train_dir)
    print("done extracting training features")
    print("")

    print("extracting test features...")
    # Reuse the training feature dictionary for the test extraction.
    X_test, _, y_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print("done extracting test features")
    print("")

    print("making predictions...")
    preds = classif.classify(X_train, y_train, X_test, **op)
    print("done making predictions")
    print("")

    print("writing predictions...")
    util.write_predictions(preds, test_ids, params['outputFile'])
    print("done!")
def mainTestIter(withhold=0, params=None):
    """Cross-validate the classifier for each value of one learning option.

    Data is loaded once through ``test.loadData``. Then, for every value in
    ``params['range']``, the option named ``params['option']`` is set to that
    value, the data is split into ``params['n_folds']`` shuffled folds, and
    ``classification_methods.classify`` is run on every fold. The per-fold
    error is computed with ``testCatErr``; its mean and standard deviation
    across folds are recorded and printed.

    Args:
        withhold: Forwarded unchanged to ``test.loadData``.
        params: Optional dict whose entries override the defaults below.
            Keys this function reads directly:
              * ``'ffs'``: names looked up in ``feature_functions``.
              * ``'options'``: base keyword arguments for ``classify``.
              * ``'option'``: name of the keyword argument being varied.
              * ``'range'``: sequence of values to try for that option.
              * ``'n_folds'``: number of cross-validation folds.
            The whole dict is also passed to ``test.loadData``.

    Returns:
        dict mapping each tried value to ``(mean_error, std_error)``. Values
        that are lists are converted to tuples so they can be used as keys.
        An empty ``'range'`` yields an empty dict.
    """
    from sklearn import cross_validation
    import classification_methods as classif

    # default value for params
    if params is None:
        params = {}
    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,
            'loadTest': False,
            # arguments to `learn`
            'options': {},
            # the option to cycle through
            'option': None,
            # range of values to cycle through
            'range': [],
            # k-fold cross-validation
            'n_folds': 10,
            # names of feature functions to use
            'ffs': ['system_call_count_feats', 'system_call_2gram_feats'],
        },
        **params)

    # Resolve feature-function names to the callables themselves.
    ffs = [feature_functions[f] for f in params['ffs']]

    print("")
    print("extracting training/testing features...")
    time1 = time.clock()
    # loadData returns six values; only the first two are needed here.
    X, y, _, _, _, _ = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print("done extracting training/testing features %s s" % (time2 - time1))
    print("%d data, %d features" % X.shape)
    print("")

    # options for the learning engine
    options = params['options']
    option = params['option']
    values = params['range']

    # (mean, std) of the cross-validation error, one entry per tried value
    errors = []

    # len() rather than truthiness: the range may be a numpy array, and the
    # default empty range must not crash on values[0] / values[-1].
    if len(values) > 0:
        print("iterating over values of %s from %s ... %s" % (
            option, values[0], values[-1]))
    else:
        print("no values of %s to iterate over" % option)
    print("================================================================================")

    for value in values:
        print("%s = %s" % (option, str(value)))
        # Fresh copy per value so the base options are never mutated.
        op = dict(options)
        op[option] = value

        # generate k cross-validation folds (reshuffled for every value)
        kf = cross_validation.KFold(
            len(y), n_folds=params['n_folds'], shuffle=True)
        print("k-fold cross-validation with %d folds" % params['n_folds'])

        cv_err = []
        for train, tests in kf:
            # generate partition
            X_train, y_train = X[train], y[train]
            X_test, y_test = X[tests], y[tests]

            # train and predict
            print("learning and predicting...")
            time1 = time.clock()
            preds = classif.classify(X_train, y_train, X_test, **op)
            time2 = time.clock()
            print("done learning and predicting,  %s s" % (time2 - time1))
            print("")

            # cross-validate
            cv_err.append(testCatErr(preds, y_test))
            print("Err on withheld data: %f" % cv_err[-1])
            print("")

        # calculate mean, std. across folds
        cv_err_mean, cv_err_std = np.mean(cv_err), np.std(cv_err)

        print("")
        print("Avg. Err: %f" % cv_err_mean)
        print("Std. Err: %f" % cv_err_std)
        errors.append((cv_err_mean, cv_err_std))

        print("--------------------------------------------------------------------------------")

    print("================================================================================")

    # tabulate results
    results = dict()
    print("Features:")
    print(params['ffs'])
    print("")
    print("Options:")
    print(options)
    print("")
    print("Results:")
    print("%18s \t Err \t std" % option)
    for value, err in zip(values, errors):
        print("%18s \t %f \t %f" % (value, err[0], err[1]))
        # Lists are unhashable; store them under an equivalent tuple key.
        if isinstance(value, list):
            value = tuple(value)
        results[value] = err

    return results