def mainVisualize(params=None):
    """Show the training feature matrix as a heat map.

    Loads the data through ``test.loadData`` with the 2-gram system-call
    feature function and ``withhold = 0``, prints the number of feature
    columns and the extraction time, then displays the transposed, densified
    training matrix with matplotlib.

    Args:
        params: optional settings dict; it is passed through
            ``test.defParams`` before use (presumably to fill in default
            values -- confirm against ``test.defParams``). ``None``, the
            default, is treated as an empty dict.
    """
    # Build a fresh dict per call: a ``{}`` default would be one shared object,
    # so anything defParams wrote into it would leak into later calls.
    if params is None:
        params = {}
    params = test.defParams(params)

    withhold = 0

    # TODO put the names of the feature functions you've defined above in this list
    # (system_call_count_feats and first_last_system_call_feats were earlier choices)
    ffs = [system_call_2gram_feats]

    # extract features; only the training matrix (first returned item) is plotted
    print("extracting training features...")
    time1 = time.clock()
    X_train = test.loadData(params, withhold, ffs)[0]
    time2 = time.clock()
    print("done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1))
    print("")

    # matplotlib is imported lazily so it is only required when plotting
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    # Transpose so the feature columns run along the vertical axis; matshow
    # needs a dense array, hence toarray().
    plt.matshow(X_train.T.toarray(), cmap=cm.get_cmap('Reds'))
    plt.colorbar()
    plt.show()
def mainTest(withhold=0, params=None):
    """Fit the extra-trees classifier, then optionally score and save predictions.

    Args:
        withhold: forwarded to ``test.loadData``. When it is non-zero the
            predictions are compared with ``y_test`` through ``testCatAcc``
            and the result is printed.
        params: optional settings dict; it is passed through
            ``test.defParams`` before use (presumably to fill in default
            values -- confirm against ``test.defParams``). ``None``, the
            default, is treated as an empty dict. The keys read here are
            ``'writePredict'`` and ``'outputFile'``.
    """
    # Build a fresh dict per call: a ``{}`` default would be one shared object,
    # so anything defParams wrote into it would leak into later calls.
    if params is None:
        params = {}
    params = test.defParams(params)

    # TODO put the names of the feature functions you've defined above in this list
    # (first_last_system_call_feats was an earlier choice)
    ffs = [system_call_count_feats, system_call_2gram_feats]

    # extract features
    print("extracting training features...")
    time1 = time.clock()
    X_train, t_train, train_ids, X_test, y_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print("done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1))
    print("")

    # Earlier experiments called methods.logRegress, methods.decisionTree and
    # methods.randomForest here with the same three arguments.
    preds = methods.extraTrees(X_train, t_train, X_test)

    if withhold != 0:
        print(testCatAcc(preds, y_test))

    # The explicit comparison is kept on purpose: only True (or 1) enables
    # writing, whereas a plain truthiness test would also accept e.g. any
    # non-empty string.
    if params['writePredict'] == True:
        print("writing predictions...")
        util.write_predictions(preds, test_ids, params['outputFile'])
        print("done!")
def mainVisualizeFeatures(params=None):
    """Bar-plot each training feature's mean, with its standard deviation as error bar.

    Features are extracted from the ``"train"`` directory with the 2-gram
    system-call feature function. Bars are ordered by ascending mean, and
    every bar keeps its own standard deviation and its own tick label.

    Args:
        params: optional settings dict; it is passed through
            ``test.defParams``. ``None``, the default, is treated as an
            empty dict.
    """
    # Build a fresh dict per call: a ``{}`` default would be one shared object,
    # so anything defParams wrote into it would leak into later calls.
    if params is None:
        params = {}
    # The result is not read below; the call is kept in case defParams has
    # side effects the rest of the program relies on.
    params = test.defParams(params)

    train_dir = "train"

    # TODO put the names of the feature functions you've defined above in this list
    # (system_call_count_feats and first_last_system_call_feats were earlier choices)
    ffs = [system_call_2gram_feats]

    # extract features; the last two returned items are not needed here
    print("extracting training features...")
    time1 = time.clock()
    X_train, global_feat_dict = extract_feats(ffs, train_dir)[:2]
    time2 = time.clock()
    print("done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1))
    print("")

    n_features = len(global_feat_dict)
    feature_data = np.zeros((n_features, 2))  # column 0: mean, column 1: std
    feature_names = [None] * n_features

    # global_feat_dict maps feature name -> column index. Store the name at
    # that index so names and statistics line up regardless of the dict's
    # iteration order.
    for feature, index in global_feat_dict.items():
        column = X_train[:, index].toarray()
        feature_data[index, 0] = column.mean()
        feature_data[index, 1] = column.std()
        feature_names[index] = feature

    # Order by mean using ONE permutation applied to rows and names alike.
    # An in-place sort(axis=0) would sort the mean and std columns
    # independently and detach both from their labels.
    order = np.argsort(feature_data[:, 0], kind="mergesort")
    feature_data = feature_data[order]
    feature_names = [feature_names[i] for i in order]

    # matplotlib is imported lazily so it is only required when plotting
    import matplotlib.pyplot as plt

    ind = np.arange(n_features)

    plt.bar(ind, feature_data[:, 0], yerr=feature_data[:, 1])
    plt.xticks(ind, feature_names, rotation='vertical')

    plt.show()