Example #1
def do_prediction(train_dir, test_dir, outputfile, ffs):
    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    print X_train
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    # learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    learn.TRAINING_FUNCTION(X_train, global_feat_dict, t_train, train_ids)
    print "done learning"
    print
    
    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    # preds = np.argmax(X_test.dot(learned_W),axis=1)
    preds = learn.TESTING_FUNCTION(X_test, global_feat_dict, test_ids)
    # preds = learn.logistic_regression(X_train, X_test, global_feat_dict, t_train, train_ids, test_ids)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #2
def mainTest(withhold=0, params={}):
    #default value for params
    params = test.defParams(params)

    train_dir = "train"
    test_dir = "test"

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_count_feats, system_call_2gram_feats]
    #ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    time1 = time.clock()
    X_train, t_train, train_ids, X_test, y_test, test_ids = test.loadData(
        params, withhold, ffs)
    time2 = time.clock()
    print "done extracting %d training features, time: %.4f s" % (
        X_train.shape[1], time2 - time1)
    print

    #preds = methods.logRegress(X_train,t_train,X_test)
    #preds = methods.decisionTree(X_train,t_train,X_test)
    #preds = methods.randomForest(X_train,t_train,X_test)
    preds = methods.extraTrees(X_train, t_train, X_test)

    if withhold != 0:
        print testCatAcc(preds, y_test)

    if params['writePredict']:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
        print "done!"
Example #3
def syscall_count_by_type():
    mat,key,cats = pickle.load(open('matrix_train', 'rb'))
    test_mat,ids = pickle.load(open('matrix_test', 'rb'))
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(mat,cats)
    util.write_predictions(clf.predict(test_mat),ids,
                           'syscall_count_by_type-3.csv')
Example #4
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    # ffs = [first_last_system_call_feats, system_call_count_feats, frequency]
    #ffs = [quadgrams]
    ffs = [first_last_system_call_feats, quadgrams]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(
        ffs, train_dir)

    # print X_train # Not currently a np.array, need to do .toarray()
    # print global_feat_dict

    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."

    # rf = RandomForestClassifier(max_features = 2750, max_depth = 28)
    # rf.fit(X_train.toarray(), t_train)

    nn = MLPClassifier(max_iter=10000, hidden_layer_sizes=(320, ))
    nn.fit(X_train.toarray(), t_train)

    # rf = RandomForestClassifier(max_features = 100, max_depth = 90)
    # rf.fit(X_train.toarray(), t_train)

    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids

    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."

    # preds = rf.predict(X_test.toarray())
    preds = nn.predict(X_test.toarray())

    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #5
def nb_test_data(test_data, sent_model: NaiveBayesModel,
                 auth_model: NaiveBayesModel, output_filename: str):
    predictions = []
    for (review_id, review_text) in test_data:
        sent_class = nb_predict_sentiment(sent_model, review_text)
        auth_class = nb_predict_authenticity(auth_model, review_text)
        predictions.append((review_id, auth_class, sent_class))
    write_predictions(predictions, output_filename)
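Note that write_predictions here takes a list of (review_id, auth_class, sent_class) tuples rather than the (preds, ids, outputfile) triple seen in most other examples. A minimal sketch of what this variant might do, purely as an assumption:

import csv

def write_predictions(predictions, output_filename):
    # one CSV row per (review_id, authenticity, sentiment) tuple
    with open(output_filename, 'w') as f:
        csv.writer(f).writerows(predictions)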
Example #6
def mainTest(withhold=0, params=None):

    #default value for params
    if params is None:
        params = {'withhold': 0,
          'load': None,
          'extractFile': None,
          'trainFile': None,
          'testFile': None,
          'writePredict': False,
          'outputFile': 'predictions.csv'
          }

    trainfile = "train.xml"
    testfile = "testcases.xml"

    # TODO put the names of the feature functions you've defined above in this list
    #ffs = [metadata_feats, unigram_feats]
    ffs = [metadata_feats, unigram_noStop]
    #ffs = [metadata_feats, bigram_feats_noStop]
    #ffs = [metadata_feats, bigram_feats_noStop, unigram_noStop]
    #totRevLen, revLens
    #ffs = [metadata_feats, unigram_noStop, revLens]

    print "extracting training/testing features..."
    time1 = time.clock()
    X_train, y_train, train_ids,X_test,y_test,test_ids = test.loadData(params, withhold, ffs)
    time2 = time.clock()
    print "done extracting training/testing features", time2-time1, "s"
    print

    # TODO train here, and return regression parameters
    print "learning..."
    time1 = time.clock()
    #learned_w = splinalg.lsqr(X_train,y_train)[0]
    learned_w = splinalg.lsmr(X_train,y_train,damp=5000)[0]
    time2 = time.clock()
    print "done learning, ", time2-time1, "s"
    print

    # get rid of training data and load test data
    del X_train
    del y_train
    del train_ids

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = X_test.dot(learned_w)
    print "done making predictions"
    print

    if withhold > 0:
        print "MAE on withheld data:", testMAE(preds, y_test)

    if params['writePredict']:
        print "writing predictions..."
        util.write_predictions(preds, test_ids, params['outputFile'])
        print "done!"
Example #7
def mainTestPred(withhold=0, params=None):
    from sklearn import cross_validation
    import classification_methods as classif

    #default value for params
    if params is None:
        params = {}

    params = dict(
        {
            'withhold': 0,
            'load': None,
            'extractFile': None,

            # arguments to `learn`
            'options': {},

            # k-fold cross-validation
            'n_folds': 10,

            # feature functions
            'ffs': ['system_call_unigram_feats']
        },
        **params)

    op = dict(params['options'])

    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [feature_functions[f] for f in params['ffs']]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, y_train, train_ids = extract_feats(
        ffs, train_dir)
    print "done extracting training features"
    print

    print "extracting test features..."
    X_test, _, y_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = classif.classify(X_train, y_train, X_test, **op)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, params.get('outputFile', outputfile))  # the defaults above define no 'outputFile'
    print "done!"
Example #8
def predict(train, test, pred_file):
    y_hat, train_rss = run_model(train, test, 'prediction', 0)
    for i, yi in enumerate(y_hat):
        if yi < 0:
            y_hat[i] = 0
        if yi > 5:
            y_hat[i] = 5
    for i, entry in enumerate(test):
        entry['rating'] = float(y_hat[i])    
    util.write_predictions(test, pred_file)
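As an aside, the clamping loop above is equivalent to a single numpy.clip call, assuming y_hat is a NumPy array:

import numpy as np

y_hat = np.clip(y_hat, 0, 5)  # bound predicted ratings to the valid [0, 5] range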
Example #9
def screens_budget_summer_lglglg():
    mat,key,regy,_ = rs.extract_feats([rs.metadata_feats])
    screen_ind = key['number_of_screens']
    budget_ind = key['production_budget']
    summer_ind = key['summer_release']

    screens = mat.getcol(screen_ind).todense()
    budget = mat.getcol(budget_ind).todense()
    summer = mat.getcol(summer_ind).todense()

    def safelog(x):
        if x <= 0.:
            return 0.
        else:
            return math.log(x)
    fns = [safelog, safelog, safelog, safelog]
    bs_check = lambda x:x[1] > 0. and x[2] > 0.
    bns_check = lambda x:x[1] > 0. and x[2] == 0.
    nbs_check = lambda x:x[1] == 0. and x[2] > 0.
    nbns_check = lambda x:x[1] == 0. and x[2] == 0.

    bs_arr = format_arr([screens,budget,summer], regy, fns, bs_check)
    bns_arr = format_arr([screens,budget,summer], regy, fns, bns_check)
    nbs_arr = format_arr([screens,budget,summer], regy, fns, nbs_check)
    nbns_arr = format_arr([screens,budget,summer], regy, fns, nbns_check)

    budget_basis_fns = [lambda x:1, lambda x:x[0], lambda x:x[0]**2,
                        lambda x:x[1], lambda x:x[1]**2]
    no_budget_basis_fns = [lambda x:1, lambda x:x[0], lambda x:x[0]**2]

    bs_coeffs = freg.coeffs(budget_basis_fns, bs_arr)
    bns_coeffs = freg.coeffs(budget_basis_fns, bns_arr)
    nbs_coeffs = freg.coeffs(no_budget_basis_fns, nbs_arr)
    nbns_coeffs = freg.coeffs(no_budget_basis_fns, nbns_arr)

    test,_,_,ids = rs.extract_feats([rs.metadata_feats],'testcases.xml',
                                    global_feat_dict = key)
    test_len = test.shape[0]
    preds = []
    for i in range(test_len):
        prod = 0
        x = [test[i,screen_ind], test[i,budget_ind], test[i,summer_ind]]
        logx = tuple([safelog(feat) for feat in x])
        if bs_check(x):
            prod = freg.product(logx, bs_coeffs, budget_basis_fns)
        elif bns_check(x):
            prod = freg.product(logx, bns_coeffs, budget_basis_fns)
        elif nbs_check(x):
            prod = freg.product(logx, nbs_coeffs, no_budget_basis_fns)
        elif nbns_check(x):
            prod = freg.product(logx, nbns_coeffs, no_budget_basis_fns)
        if prod < 0:
            prod = 0
        preds.append(math.e**prod)
    util.write_predictions(preds, ids, 'screens_budget_summer_lglglg-2.csv')
Example #10
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument
    
    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]
    
    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print
    print "global_feat_dict"
    pprint(global_feat_dict)
    print "t_train"
    pprint(t_train)
    
    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    print "learned_W"
    pprint(learned_W)
    print "done learning"
    print
    
    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test,quoi,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print "Quoi"
    pprint(quoi)
    print "t_ignore"
    pprint(t_ignore)

    print


    
    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = np.argmax(X_test.dot(learned_W),axis=1)
    print "preds"
    pprint(preds)
    print "done making predictions"
    print
    
    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #11
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "logistic.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [system_call_counts, system_call_count_feats]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(
        ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    # RF = RandomForestClassifier()
    # RF.fit(X_train, t_train)
    #learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    X_train = X_train.toarray()
    y_train = to_categorical(t_train)
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit(X_train, y_train, epochs=200, batch_size=64)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    # preds = RF.predict(X_test)
    #preds = np.argmax(X_test.dot(learned_W),axis=1)
    preds_vec = model.predict(X_test.toarray())
    preds = np.argmax(preds_vec, axis=1)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #12
def main(saved_extraction=None, type_clf='tree', nb_tree=20):
    from sklearn.ensemble import RandomForestClassifier
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"
    # Add any new feature functions here; each must return a Counter.
    ffs = [first_last_system_call_feats, system_call_count_feats,
           syscall_name_counter, dll_type, failure_success,
           string_entropy, Api_call_counter]
    if saved_extraction:
        X_train,global_feat_dict,t_train,train_ids = np.load('train_extract.npy')
        X_test,_,t_ignore,test_ids = np.load('test_extract.npy')
    else:
        X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
        X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
        np.save('train_extract.npy',(X_train,global_feat_dict,t_train,train_ids))
        np.save('test_extract.npy',(X_test,_,t_ignore,test_ids))
    #CrossValidation for mScoring Purposes
    print "Number of Feature used in the analysis :", X_train.shape

    Class_weight=np.array([3.69,1.62,1.2,1.03,1.33,1.26,1.72,1.33,52.14,0.68,17.56,1.04,12.18,1.91,1.3])/100.0
    if type_clf=='tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf=='Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    elif type_clf=='SVC':
        clf = svm.SVC(kernel='rbf')
    if type_clf =='tree' or type_clf=='Etree':
        weight=[]
        for i in range(0,len(t_train[:int(len(X_train)*0.75)])):
            ind=t_train[i]
            weight.append(Class_weight[ind])
        clf.fit(X_train[:int(len(X_train)*0.75)], t_train[:int(len(X_train)*0.75)], sample_weight=weight)
    else:
        clf.fit(X_train[:int(len(X_train)*0.75)], t_train[:int(len(X_train)*0.75)])
    CV_hat = clf.predict(X_train[int(len(X_train)*0.75):])
    d = (t_train[int(len(X_train)*0.75):] == CV_hat)
    print "Estimation is:",float(d.sum())/len(d)
    if type_clf=='tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf=='SVC':
        clf = svm.SVC(kernel='rbf')
    elif type_clf=='Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    if type_clf =='tree' or type_clf=='Etree':
        weight=[]
        for i in range(0,len(t_train)):
            ind=t_train[i]
            weight.append(Class_weight[ind])
        clf.fit(X_train, t_train, sample_weight=weight)
    else:
        clf.fit(X_train, t_train)
    t_hat = clf.predict(X_test)
    util.write_predictions(t_hat, test_ids, outputfile)
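If t_train is a NumPy integer array, the per-sample weight loops above can be replaced by fancy indexing into Class_weight; a sketch under that assumption:

import numpy as np

n = int(len(X_train) * 0.75)
# one weight per training sample, looked up by its class index
weight = Class_weight[np.asarray(t_train[:n], dtype=int)]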
Example #13
def main(saved_extraction=None, type_clf='tree', nb_tree=20):
    from sklearn.ensemble import RandomForestClassifier
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"

    # Add any new feature functions here; each must return a Counter.

    ffs = [
        first_last_system_call_feats, system_call_count_feats,
        syscall_name_counter, dll_type, failure_success
    ]

    if saved_extraction:
        X_train, global_feat_dict, t_train, train_ids = np.load(
            'train_extract.npy')
        X_test, _, t_ignore, test_ids = np.load('test_extract.npy')
    else:
        X_train, global_feat_dict, t_train, train_ids = extract_feats(
            ffs, train_dir)
        X_test, _, t_ignore, test_ids = extract_feats(
            ffs, test_dir, global_feat_dict=global_feat_dict)
        np.save('train_extract.npy',
                (X_train, global_feat_dict, t_train, train_ids))
        np.save('test_extract.npy', (X_test, _, t_ignore, test_ids))
    # clf = svm.SVC()
    # CrossValidation for mScoring Purposes

    if type_clf == 'tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf == 'Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    elif type_clf == 'SVC':
        clf = svm.SVC()
    clf.fit(X_train[:int(len(X_train) * 0.75)],
            t_train[:int(len(X_train) * 0.75)])
    CV_hat = clf.predict(X_train[int(len(X_train) * 0.75):])
    d = (t_train[int(len(X_train) * 0.75):] == CV_hat)
    print "Estimation is:", float(d.sum()) / len(d)
    if type_clf == 'tree':
        clf = RandomForestClassifier(n_estimators=nb_tree)
    elif type_clf == 'SVC':
        clf = svm.SVC()
    elif type_clf == 'Etree':
        clf = ExtraTreesClassifier(n_estimators=nb_tree)
    clf.fit(X_train, t_train)
    t_train_hat = clf.predict(X_train)
    t_hat = clf.predict(X_test)
    util.write_predictions(t_hat, test_ids, outputfile)

    return X_train, global_feat_dict, t_train, train_ids
Example #14
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 10000, TRAIN_DIR)
    # X_valid, t_valid, valid_ids = create_data_matrix(1000, 2000, TRAIN_DIR)
    X_test, t_test, test_ids = create_data_matrix(0, 3724, TEST_DIR)

    # print 'Data matrix (training set):'
    # print np.array(X_train)
    # print 'Classes (training set):'
    # print np.array(t_train)

    clf = RandomForestClassifier(n_estimators=20,
                                 max_depth=None,
                                 max_features=1,
                                 criterion="gini",
                                 min_samples_split=1,
                                 min_samples_leaf=1,
                                 bootstrap=False)
    # clf = Regressor(
    #     layers=[
    #         Layer("Rectifier", units=100),
    #         Layer("Linear")],
    #     learning_rate=0.001,
    #     n_iter=100000)

    # use a full grid over all parameters
    # param_grid = {"max_depth": [3, None],
    #           "max_features": [1, 3, 10],
    #           "min_samples_split": [1, 3, 10],
    #           "min_samples_leaf": [1, 3, 10],
    #           "bootstrap": [True, False],
    #           "criterion": ["gini", "entropy"]}

    # # run grid search
    # grid_search = GridSearchCV(clf, param_grid=param_grid)
    # grid_search.fit(X_train, t_train)
    # preds = grid_search.predict(X_test)
    # print grid_search.best_params_

    clf = clf.fit(X_train, t_train)
    preds = clf.predict(X_test)
    # right = 0
    # wrong = 0
    # for p, pred in enumerate(preds):
    #     if np.round(pred) == t_valid[p]:
    #         right +=1
    #     else:
    #         wrong +=1
    # print right
    # print wrong
    ut.write_predictions(preds, test_ids, "result.csv")
Example #15
def main():
    print "# Loading features..."
    X_train, t_train, _ = pickle.load(open("../../features/all_tags/train.pickle"))
    X_test, _, test_ids = pickle.load(open("../../features/all_tags/test.pickle"))

    print "# Training RandomForestClassifier on train data..."
    RFC = RandomForestClassifier(n_estimators = 40, n_jobs = -1)
    RFC.fit(X_train, t_train)

    print "# Predicting test data..."
    pred = RFC.predict(X_test)

    util.write_predictions(pred, test_ids, "../../predictions/single_RF_predictions.csv")

    print "# Done!"
Example #16
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    lr = cl.LogisticRegression()
    knn = cl.kNN()
    
    ds = ff.Dataset()

    print "training..."

    X, y, ids = ds.getDataset(train_dir)
    lr.fit(X,y)
    knn.fit(X,y)
    del X
    del y
    print "training complete. Now preparing for submit"

    X, y, ids = ds.getDataset(test_dir)
    predsLR = lr.predict(X)
    pbLR    = lr.classifier_().predict_proba(X)

    predskNN = knn.predict(X)
    pbkNN = knn.classifier_().predict_proba(X)

    featDict = ds.getFeatureDict()
    #print "feature", featDict['Swizzor_found']
    X_arr = X.toarray()

    finalpred = []
    for i in xrange(len(predsLR)):
        if X_arr[i][featDict['Swizzor_found']] > 0:
            choice = 10
        elif np.max(pbkNN[i]) > 0.8:
            # if kNN is more than 0.8 sure, it is very accurate
            choice = predskNN[i]
        elif np.max(pbkNN[i]) - np.max(pbLR[i]) > 0.4:
            # if kNN is 0.4 more sure than LR, use that
            choice = predskNN[i]
        else:
            choice = predsLR[i]
        finalpred.append(choice)
    
    print "writing predictions..."
    util.write_predictions(finalpred, ids, outputfile)
    print "done!"
Example #18
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "013.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [
        first_last_system_call_feats, system_call_count_feats, count_all_feats,
        count_all_reasons, count_all_flags
    ]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(
        ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    model = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    model.fit(X_train, t_train)
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = model.predict(X_test)
    # preds = np.argmax(X_test.dot(learned_W),axis=1)
    print preds
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #19
    def GetProduceSolutionResults(self, request, context):
        """
        TA2-3 API call
        """
        logging.critical("Message received: GetProduceSolutionResults")
        request_id = request.request_id
        request_params = self._solution_score_map[request_id]
        start = solutiondescription.compute_timestamp()

        solution_id = request_params.fitted_solution_id
        solution = self._solutions[solution_id]

        inputs = self._get_inputs(solution.problem, request_params.inputs)
        try:
            output = solution.produce(inputs=inputs,
                                      solution_dict=self._solutions)[0]
            logging.critical("Produce predictions with rows = %s", len(output))
        except Exception:
            logging.critical("Exception in produce: %s", solution.primitives)
            logging.critical("Exception in produce: %s", sys.exc_info()[0])
            output = None

        result = None
        search_id_str = self._solution_to_search[solution_id]
        outputDir = os.environ['D3MOUTPUTDIR'] + "/" + search_id_str

        if output is not None:
            uri = util.write_predictions(output, outputDir + "/predictions",
                                         request_id)
            uri = 'file://{uri}'.format(uri=os.path.abspath(uri))
            result = value_pb2.Value(csv_uri=uri)
        else:
            result = value_pb2.Value(error=value_pb2.ValueError(
                message="Output is NULL"))

        self._solution_score_map.pop(request_id, None)

        msg = core_pb2.Progress(state=core_pb2.COMPLETED,
                                status="",
                                start=start,
                                end=solutiondescription.compute_timestamp())

        steps = []
        for i in range(solution.num_steps()):
            steps.append(core_pb2.StepProgress(progress=msg))

        exposed_outputs = {}
        if request_params.expose_outputs is not None and len(
                request_params.expose_outputs) > 0:
            last_step_output = request_params.expose_outputs[
                len(request_params.expose_outputs) - 1]
        else:
            last_step_output = solution.outputs[0][2]

        exposed_outputs[last_step_output] = result

        yield core_pb2.GetProduceSolutionResultsResponse(
            progress=msg, steps=steps, exposed_outputs=exposed_outputs)
Example #20
def get_comparable_performance_test():
    result = write_predictions(raw_test_vua, test_dataloader_vua, RNNseq_model, using_GPU, '../data/VUAsequence/VUA_seq_formatted_test.csv')
    f = open('../predictions/vua_seq_test_predictions_LSTMsequence_vua.csv', 'w')
    writer = csv.writer(f)
    writer.writerows(result)
    f.close()

    get_performance_VUAverb_test()
    get_performance_VUA_test()
Example #21
def screens(basis_fns, fns, inv_fn, outfile):
    mat,key,regy,_ = rs.extract_feats([rs.metadata_feats])
    screen_ind = key['number_of_screens']
    screens = mat.getcol(screen_ind).todense()
    train_arr = format_arr([screens], regy, fns)
    coeffs = freg.coeffs(basis_fns, train_arr)

    test,_,_,ids = rs.extract_feats([rs.metadata_feats],'testcases.xml',
                                    global_feat_dict = key)
    test_len = test.shape[0]
    preds = []
    for i in range(test_len):
        prod = freg.product((fns[0](test[i,screen_ind]),), coeffs, basis_fns)
        if prod < 0:
            prod = 0
        preds.append(inv_fn(prod))

    util.write_predictions(preds, ids, outfile)
Example #23
def prediction(train_valid, test, pred_filename):
    
    import data_processing as dp
    dphelper = dp.data_processing()
    dense_train, sparse_train = dphelper.split(train_valid)
    dense_test, sparse_test = dphelper.split(test)
        
    #######
    import sgd_bias as sgd
    y_hat_dense, train_rmse_dense = sgd.sgd_bias(dense_train, dense_test, 'prediction')
    
    import baseline as bs
    y_hat_sparse, train_rmse_sparse = bs.baseline(sparse_train, sparse_test, 'prediction')
    
    #######
    print 'dense subset train rmse: %.16f' % train_rmse_dense
    print 'sparse subset train rmse: %.16f' % train_rmse_sparse
    test = dphelper.merge(test, y_hat_dense, y_hat_sparse)
    util.write_predictions(test, pred_filename) 
Example #24
def runCosine(training_set, user_list, validation_set, test_queries):
    global dataChoice
    users = {}
    for row in training_set:
        user_id = row['user']
        isbn = row['isbn']
        if user_id not in users:
            users[user_id] = {}
            users[user_id]['ratings'] = {}
        users[user_id]['ratings'][isbn] = row['rating']

    # calculate cosine distance and find closest match
    cosine.topMatch(users)

    # find mean rating per book
    books, global_mean = cosine.meanPerItem(users)

    total_error = 0.0
    sample_count = 0

    if dataChoice == 'validate':
        print "user\tprediction\tactual"

        for row in validation_set:
            user = row['user']
            isbn = row['isbn']

            prediction = cosine.predict(users, user, books, isbn, global_mean)
            print user, "\t", prediction, "\t\t", row['rating']
            total_error += abs(prediction - row['rating'])
            sample_count += 1
        # return the mean absolute error over the whole validation set;
        # returning inside the loop would exit after the first row
        return total_error / sample_count
    else:
        # dataChoice = 'full'
        for query in test_queries:
            user_id = query['user']
            isbn = query['isbn']
            query['rating'] = cosine.predict(users, user_id, books, isbn,
                                             global_mean)

        # Write the prediction file.
        util.write_predictions(test_queries, pred_filename)
Example #26
def main(X_train=None, global_feat_dict=None):
    trainfile = "train.xml"
    testfile = "testcases.xml"
    outputfile = "mypredictions2.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [metadata_feats, unigram_feats]

    if X_train is None and global_feat_dict is None:
        # extract features
        print "extracting training features..."
        X_train,global_feat_dict,y_train,train_ids = extract_feats(ffs, trainfile)
        print "done extracting training features"
        print

    # TODO train here, and return regression parameters
    print "learning..."
    #learned_w = splinalg.lsqr(X_train,y_train)[0]
    learned_w = splinalg.lsmr(X_train,y_train)[0]
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del y_train
    del train_ids
    print "extracting test features..."
    X_test,_,y_ignore,test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = X_test.dot(learned_w)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #27
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument
    
    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]
    
    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print
    
    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    print "done learning"
    print
    
    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print
    
    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = np.argmax(X_test.dot(learned_W),axis=1)
    print "done making predictions"
    print
    
    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #28
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(
        ffs, train_dir)
    print "done extracting training features"
    print

    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random(
        (len(global_feat_dict), len(util.malware_classes)))
    print "done learning"
    print

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = np.argmax(X_test.dot(learned_W), axis=1)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example #29
def main():
    print "# Loading features..."
    X_train, t_train, _ = pickle.load(open("../../features/all_tags/train.pickle"))
    X_test, _, test_ids = pickle.load(open("../../features/all_tags/test.pickle"))

    dtrain = xgb.DMatrix(X_train, label=t_train)

    print "# Training XGBoost on training data..."
    param = {'bst:max_depth':30, 'eta':0.1, 'silent':2, 'objective':'multi:softprob', 'num_class': 15 }
    param['eval_metric'] = 'merror'
    param['min_child_weight'] = 3
    param['nthread'] = 16
    param['colsample_bytree'] = 0.5
    evallist = [(dtrain,'train')]
    bst = xgb.train(param, dtrain, 500, evallist)

    print "# Predicting test data..."
    dout = xgb.DMatrix(X_test)
    t_probs = bst.predict(dout)
    t_pred = [prob.tolist().index(max(prob)) for prob in t_probs]

    util.write_predictions(t_pred, test_ids, "../../predictions/xgboost_predictions.csv")

    print "# Done!"
Example #30
def screens_budget_lglglg():
    mat,key,regy,_ = rs.extract_feats([rs.metadata_feats])
    screen_ind = key['number_of_screens']
    budget_ind = key['production_budget']
    screens = mat.getcol(screen_ind).todense()
    budget = mat.getcol(budget_ind).todense()

    budget_fns = [lambda x:math.log(x) for i in range(3)]
    budget_check = lambda x:x[1] > 0.
    budget_arr = format_arr([screens,budget], regy, budget_fns, budget_check)
    no_budget_arr = format_arr([screens],regy,[lambda x:math.log(x),lambda x:math.log(x)])

    budget_basis_fns = [lambda x:1, lambda x:x[0], lambda x:x[0]**2,
                        lambda x:x[1], lambda x:x[1]**2]
    no_budget_basis_fns = [lambda x:1, lambda x:x[0], lambda x:x[0]**2]

    budget_coeffs = freg.coeffs(budget_basis_fns, budget_arr)
    no_budget_coeffs = freg.coeffs(no_budget_basis_fns, no_budget_arr)

    test,_,_,ids = rs.extract_feats([rs.metadata_feats],'testcases.xml',
                                    global_feat_dict = key)
    test_len = test.shape[0]
    preds = []
    for i in range(test_len):
        prod = 0
        if test[i,budget_ind] > 0.:
            x = (budget_fns[0](test[i,screen_ind]),
                 budget_fns[1](test[i,budget_ind]))
            prod = freg.product(x, budget_coeffs, budget_basis_fns)
        else:
            x = (math.log(test[i,screen_ind]),)
            prod = freg.product(x, no_budget_coeffs, no_budget_basis_fns)
        if prod < 0:
            prod = 0
        preds.append(math.e**prod)
    util.write_predictions(preds, ids, 'screens_budget_lglglg-2.csv')
Example #31
def main():
    final_ids = []
    final_prediction = []

    # fetch features for training and test data
    # substitute for a pickle load, for training data!
    print "# Loading Features..."
    X_train, t_train, train_ids = pickle.load(open("../../features/all_tags/train.pickle"))
    X_test, t_test, test_ids = pickle.load(open("../../features/all_tags/test.pickle"))

    # separates the t_train only between 0 and 1, where 0 is None and 1 
    # is any Malware
    none = util.malware_classes.index("None")
    t_train_bin = [0 if x == none else 1 for x in t_train]
    t_test_bin = [0 if x == none else 1 for x in t_test]

    # train a Random Forest on the data, using a binary classification only
    # (between Malware and None)
    print "# Training RandomForestClassifier with n_estimators = {}, for a binary classification between Malware or None...".format(N)
    RFC_bin = RandomForestClassifier(n_estimators = N, n_jobs = -1)
    RFC_bin.fit(X_train, t_train_bin)

    print "# Predicting Malware vs None..."
    # predict whether the test inputs are Malwares or Nones
    pred_bin = RFC_bin.predict(X_test)

    # fetch all datapoints that we considered as Malwares
    X_test_malware = []
    t_test_malware = []
    test_ids_malware = []

    for predicted, ID, true, features in zip(pred_bin, test_ids, t_test, X_test):
        # if we predicted None, this goes to our final prediction
        # otherwise, we add it to X_test_malware
        if predicted == 0:
            final_prediction.append(none)
            final_ids.append(ID)
        else:
            X_test_malware.append(features)
            t_test_malware.append(true)
            test_ids_malware.append(ID)

    # fetch all the Malwares
    X_train_malware = []
    t_train_malware = []

    for true, features in zip(t_train, X_train):
        if true != util.malware_classes.index("None"):
            X_train_malware.append(features)
            t_train_malware.append(true)

    # np.asarray returns a new array, so the results must be assigned back
    X_train_malware = np.asarray(X_train_malware)
    t_train_malware = np.asarray(t_train_malware)

    print "# Training another RandomForestClassifier with n_estimators = {}, for a multi-class classification between only Malwares..."
    # train a Random Forest on the data, using now only the Malwares
    RFC_malware = RandomForestClassifier(n_estimators = 64, n_jobs = -1, class_weight = 'balanced')
    RFC_malware.fit(X_train_malware, t_train_malware)
    
    print "# Predicting whatever we had not classified as None before..."
    pred_malware = RFC_malware.predict(X_test_malware)

    for predicted, ID in zip(pred_malware, test_ids_malware):
        final_prediction.append(predicted)
        final_ids.append(ID)

    util.write_predictions(final_prediction, final_ids, "../../predictions/multi_classifier_predictions.csv")

    print "# Done!"
Example #32
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cross_validate', help='Run cross-validation (instead of output)', action='store_true')
    parser.add_argument('-t', '--train_file', nargs=1, help='Training file')
    args = parser.parse_args()

    if args.train_file:
        trainfile = args.train_file[0]
    else:
        trainfile = "train.xml"
    testfile = "testcases.xml"
    outputfile = "mypredictions2.csv"  # feel free to change this or take it as an argument
    
    # put the names of the feature functions you've defined above in this list
    ffs = [metadata_feats, squared_terms]#, prod_company, review_score] #, review_terms] #, unigram_feats, threshold_terms]

    
    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,y_train,train_ids = extract_feats(ffs, trainfile)
    global_feat_dict_sorted = sorted(global_feat_dict.iteritems(), key=operator.itemgetter(1))
    print global_feat_dict_sorted
    #print X_train.sum(axis=0)
    #print "1:",X_train[0]
    #print "2:",X_train[1]
    #print "3:",X_train[2]

    print "done extracting training features"
    print
    

    if args.cross_validate:
        print "running cross-validation tests..."
        score = crossvalidate.getScore(X_train,y_train, splinalg.lsqr)
        print "MAE cross validation score:",score
        print "done cross-validation"
    else:

        # write out predictions on test data

        # train here, and return regression parameters
        print "learning..."
        learned_w = splinalg.lsqr(X_train,y_train)[0]
        print '\n'.join(['%i: %8.8f %s' % 
                         (n, learned_w[n], global_feat_dict_sorted[n][0]) for n in xrange(len(learned_w))])
        '''
        preds = np.absolute(X_train.dot(learned_w))
        myfile = open('bb.txt','wb')
        wr = csv.writer(myfile, dialect='excel')
        for i in range(len(preds)):
            wr.writerow([i,X_train[i,0],X_train[i,1], y_train[i], preds[i]])
        '''
        print "done learning"
        print

        # get rid of training data and load test data
        del X_train
        del y_train
        del train_ids
        print "extracting test features..."
        X_test,_,y_ignore,test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
        print "done extracting test features"
        print
        
        # make predictions on test data and write them out
        print "making predictions..."
        preds = np.absolute(X_test.dot(learned_w))
        # blockbuster correction factor
        for i in range(len(preds)):
            if X_test[i,1] > 50000000.0:
                preds[i] *= 0.85
        print "done making predictions"
        print
    
        print "writing predictions..."
        util.write_predictions(preds, test_ids, outputfile)
        print "done!"
Example #33
X_sentiment_origin = X_sentiment.copy()

X_sentiment = X_sentiment_origin.copy()
y_train = y_train_origin.copy()

y_train = np.log(y_train)
mask = np.array(y_train > 14)
y_train = y_train[mask]
X_sentiment += 0.1
X_sentiment = np.log(X_sentiment[mask, :])
df = DataFrame(np.concatenate((y_train[:, np.newaxis], X_sentiment), axis=1))
scatter_matrix(df, alpha=0.2, figsize=(15, 15), diagonal='kde')
"""

"""
# TODO train here, and return regression parameters
print "learning..."
learned_w = splinalg.lsqr(X_train,y_train)[0]
print "done learning"
print

# get rid of training data and load test data
del X_train
del y_train
del train_ids
print "extracting test features..."
X_test,_,y_ignore,test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
print "done extracting test features"
print
Example #34
File: rf.py Project: ayoung01/cs181
                           random_state=None, verbose=0, min_density=None,
                           compute_importances=None)
erf.fit(X_train, y_train)
print "POST-selection oob\t%.4f" % (erf.oob_score_)
#print "Test oob\t%.4f" % erf.score(X_test, y_test)
pred_selected = erf.predict(X_test)

# Output predictions
#y_pred = erf.predict(X_test)

from sklearn.metrics import accuracy_score
print accuracy_score(pred_full, pred_selected)

ids = np.load(open('ids', 'rb'))
import util
util.write_predictions(pred_selected, ids, 'predictions/erf_80var.csv')

"""
pos = np.arange(sorted_idx.shape[0]) + .5

discard_bottom = 60
pos_plot = pos[discard_bottom:]
fi_plot = feature_importance[sorted_idx][discard_bottom:]
names_plot = feature_names[sorted_idx][discard_bottom:]
pl.subplot(1, 1, 1)
pl.barh(pos_plot, fi_plot, align='center')
pl.yticks(pos_plot, names_plot)
pl.xlabel('Relative Importance')
pl.title('Variable Importance RF')
pl.show()
"""
Example #35
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument
    # DONE put the names of the feature functions you've defined above in this list
    # we added all of our feature engineering, and we also tried multiple pairings
    # of system calls: bigrams, trigrams and quadrigrams.
    # We ran each of these separately. The validation accuracy is reported in
    # the LaTeX table.

    # ffs without any grams, only features engineering
    #    ffs = [first_last_system_call_feats, system_call_count_feats, system_load_dll_feats,
    #            system_open_key_feats, system_vm_protect_feats, system_dump_line_feats,
    #            system_delete_file_feats, system_remove_directory_feats, system_create_directory_feats]
    # bigram ffs
    ffs = [
        first_last_system_call_feats, system_call_count_feats,
        system_load_dll_feats, system_open_key_feats, system_vm_protect_feats,
        system_dump_line_feats, system_delete_file_feats,
        system_remove_directory_feats, system_create_directory_feats,
        system_bigrams_feats
    ]
    # trigrams ffs
    #    ffs = [first_last_system_call_feats, system_call_count_feats, system_load_dll_feats,
    #            system_open_key_feats, system_vm_protect_feats, system_dump_line_feats,
    #            system_delete_file_feats, system_remove_directory_feats, system_create_directory_feats,
    #            system_trigrams_feats]
    # quadrigrams ffs
    #    ffs = [first_last_system_call_feats, system_call_count_feats, system_load_dll_feats,
    #            system_open_key_feats, system_vm_protect_feats, system_dump_line_feats,
    #            system_delete_file_feats, system_remove_directory_feats, system_create_directory_feats,
    #            system_quadrigrams_feats]

    # extract features
    print("extracting training features...")
    X_train, global_feat_dict, t_train, train_ids = extract_feats(
        ffs, train_dir)
    print("done extracting training features")
    print()

    # split x_Train into a train (75%) and validation set (25%)
    a1, a2, a3, a4 = np.split(X_train.todense()[:][:3080], 4)
    X_train_train = sparse.csr_matrix(np.vstack((a1, a2, a3)))
    X_train_valid = sparse.csr_matrix(a4)
    # split t_Train into a train (75%) and validation set (25%)
    a1, a2, a3, a4 = np.split(t_train[:3080], 4)
    t_train_train = np.concatenate((a1, a2, a3))
    t_train_valid = a4
    # As we can see, our validation technique is a bit simplistic. We are only
    # taking the last 25% chunk of the data, and we know the data is acquired
    # over a few days so we are validating on the later days. This gives us a
    # somewhat skewed validation set. That being said, we decide to stick with
    # it because what really matters in this validation scheme is the overall
    # hierarchy of which models do better, because we then re-train the best
    # model on the full training set to make our predictions to Kaggle.

    # DONE train here, and learn your classification parameters
    # we train naive bayes, svm, random forest and gradient boosted trees on
    # all three cases, bigrams, trigrams and quadrigrams
    print("learning...")
    # We first start with a multinomial naive bayes classifier, since this is
    # something we learned with the generative models, and it is a linear
    # model and thus relatively simple
    # 1st - Multinomial Naive Bayes Classifier
    model_mnb = MultinomialNB()
    model_mnb.fit(X_train_train, t_train_train)
    x1 = categ_accuracy(model_mnb, X_train_valid, t_train_valid)
    print("MNB Classifier Accuracy: " + str(x1))
    # the results are not great, so we try a more complex linear model

    # next, we wanted to try an SVM since we started learning about SVMs and it
    # is a slightly more complex and generalizable linear model
    # 2nd - SVM Classifier
    model_svm = svm.SVC()  # SVM Classifier
    model_svm.fit(X_train_train, t_train_train)
    x0 = categ_accuracy(model_svm, X_train_valid, t_train_valid)
    print("SVM Classifier Accuracy: " + str(x0))
    # the results are much better, but can still probably be better. We turn to
    # ensemble methods

    # next, we wanted to try a random forest classifier as an easier-to-train
    # non-linear model that worked really well on the last practical
    # 3rd - Random Forest Classifier
    model_rf = RandomForestClassifier()
    model_rf.fit(X_train_train, t_train_train)
    x2 = categ_accuracy(model_rf, X_train_valid, t_train_valid)
    print("RF  Classifier Accuracy: " + str(x2))
    # this is quite good! We nonetheless try one last model

    # finally, we try gradient boosting trees as a supposedly better random forest
    # 4th - Gradient Boosting Classifier
    model_gb = GradientBoostingClassifier()
    model_gb.fit(X_train_train, t_train_train)
    x3 = categ_accuracy(model_gb, X_train_valid, t_train_valid)
    print("GB  Classifier Accuracy: " + str(x3))
    # does about the same as random forest, but not really better. let's see
    # what happens after doing grid search to optimize parameters.

    # The best appears to be a random forest, so we will use grid search
    # to improve the parameters
    # we save the best parameters here
    best_model_parameters = None
    # this is about the score from our random parameters, it will be our benchmark
    # to improve with grid search
    best_acc = 0  # zero initial accuracy
    # grid over tree depth and number of estimators
    for depth in [1, 20, 40, 60]:
        for n_estimators in [
                1, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220
        ]:
            # setting up the model parameters dict
            model_parameters = {
                'max_depth': depth,
                'random_state': 0,
                'n_estimators': n_estimators
            }
            optimized_model_rf = RandomForestClassifier(**model_parameters)
            # running cross validation for the model with the parameters
            optimized_model_rf.fit(X_train_train, t_train_train)
            val_acc = categ_accuracy(optimized_model_rf, X_train_valid,
                                     t_train_valid)
            # updating the best parameters if the avg validation accuracy is better
            if val_acc > best_acc:
                best_acc = val_acc
                best_model_parameters = model_parameters
    # display the best parameters and the associated accuracy
    print('Best RF/GB model parameters:', best_model_parameters)
    print('Best RF Classifier Accuracy:', best_acc)

    # doing grid search for gradient boosting is too computationally consuming.
    # However, we know that gradient boosting trees are essentially a boosted
    # random forest, and we can see that from the fact that their accuracy scores
    # are very similar. Therefore, we infer that optimal parameters for random
    # forest will be similar to optimal parameter for gradient boosted trees
    # and use the same optimal parameters.

    # train our model with optimal parameters from random forest grid search
    optimized_model_gb = GradientBoostingClassifier(**best_model_parameters)
    optimized_model_gb.fit(X_train_train, t_train_train)
    val_acc_gb = categ_accuracy(optimized_model_gb, X_train_valid,
                                t_train_valid)
    # display the best accuracy
    print('Best GB Classifier Accuracy:', val_acc_gb)

    # As we can see (as listed in the report), we get better accuracy for
    # a normal random forest classifier for all types of grams, so we simply
    # use a random forest for the final training and for our submission.
    # We can also see that bigrams provide the best accuracy, so we stick to
    # bigrams for our predictions submission.

    # RF is the best-performing model so far, at about 0.87, so we'll train
    # an RF on the full data
    # We use random forest for predictions. Here, we retrain a new random
    # forest model on the full training data set (so as to have a better model
    # than the one we used to get accuracy) and use that for submission
    model_rf_all_opt = RandomForestClassifier(**best_model_parameters)
    model_rf_all_opt.fit(X_train, t_train)
    print("done learning")
    print()

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print("extracting test features...")
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print("done extracting test features")
    print()

    # make predictions on test data and write them out
    print("making predictions...")
    preds = model_rf_all_opt.predict(X_test)
    print("done making predictions")
    print()
    print("writing predictions...")
    util.write_predictions(preds, test_ids, outputfile)
    print("done!")
Example No. 36
0
import math
import numpy as np
import books
import visualize
import mf
import util
import shared_utils as su


# do this once to build the ratings and save them to ratings_tuple_std
books.build_ratings(filename="ratings_tuple_std", standardize=True, withhold=20000)

# load training data
data_train = su.unpickle("ratings_tuple_std")

# choose a number of features, limit the time the simulation runs
K = 5
max_steps = 2 # change this to something reasonable, like 200 or 500

# update this for each trial you do with a particular k
run = 0

data_mfact = mf.mfact(data_train["ratings"], data_train["N"], data_train["D"], \
	K, steps=max_steps, filename=("mfact_%d_run_%d" % (K, run)))

# make some predictions
predictions = books.make_predictions(data_train, data_mfact)

# write the predictions
util.write_predictions(predictions,("predictions_%d_run_%d.csv" % (K, run)))
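
# mf.mfact above is this project's own module; purely as a hedged
# illustration (assumed names and update rule, not mf.mfact's actual code),
# a generic SGD matrix factorization over (user, item, rating) triples
# could look like this:
def sgd_mfact(ratings, N, D, K, steps=200, lr=0.005, reg=0.02):
    U = np.random.normal(scale=0.1, size=(N, K))  # user factor matrix
    V = np.random.normal(scale=0.1, size=(D, K))  # item factor matrix
    for _ in range(steps):
        for u, i, r in ratings:
            err = r - U[u].dot(V[i])  # residual for this rating
            Uu = U[u].copy()          # keep the old user factors for V's update
            U[u] += lr * (err * V[i] - reg * U[u])
            V[i] += lr * (err * Uu - reg * V[i])
    return U, V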
Example No. 37
0
import numpy as np
import ensemble
from train_model_library import train_model_library
import util
import makePred

ensemble_library_pred, validation_labels, scaler, model_grid = train_model_library(
    n_folds_to_compute=1)

ensemble, acc, n, c1acc = ensemble.generate_ensemble(ensemble_library_pred,
                                                     validation_labels,
                                                     n_init=3,
                                                     tolerance=.00001)

ids, features = util.load_test("kaggle_test_tf_idf_l1_norm.csv")
labels = makePred.makePrediction(ensemble, model_grid, features, scaler)

util.write_predictions(labels, "idflabels_lean_2.csv")

print("done")
Example No. 38
0
tree_raw = np.loadtxt('predictions/syscall_count_by_type-1.csv', dtype=str,
                        delimiter=';')
tree_reader = csv.reader(tree_raw, delimiter=',')
next(tree_reader)  # skip the first row (presumably the header)
preds = []
ids = []
for row in tree_reader:
    f_id = row[0]
    tree_pred = int(row[1])
    if PROPS[tree_pred] < PROPS[log_preds[f_id]]:
        preds.append(tree_pred)
    else:
        preds.append(log_preds[f_id])
    ids.append(f_id)

util.write_predictions(preds, ids, 'predictions/combined-2.csv')

'''mat,_,cat = pickle.load(open('matrix_train', 'rb'))
mats,cats = extract.split_data(mat,cat,7)
correct = 0.
for i in range(7):
    train_mats = [mats[j] for j in range(7) if not i == j]
    train_cats = [cats[j] for j in range(7) if not i == j]
    train_mat, train_cat = extract.join_data(train_mats, train_cats)
    test_mat, test_cat = mats[i], cats[i]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_mat, train_cat)
    tree_preds = clf.predict(test_mat)

    logreg = linmod.LogisticRegression()
Example No. 39
0
import numpy as np
import util

# This is just about the dumbest possible predictor, but it shows the
# really basic things you need to know to read in the training data
# and write a valid prediction file.

pred_filename  = 'pred-global-mean.csv'
train_filename = 'ratings-train.csv'
test_filename  = 'ratings-test.csv'

training_data  = util.load_train(train_filename)
test_queries   = util.load_test(test_filename)

# Compute the mean rating.
num_train = len(training_data)
mean_rating = float(sum(map(lambda x: x['rating'], training_data)))/num_train
print "The mean rating is %0.3f." % (mean_rating)

# Use the global mean to make predictions.
# Iterate over the test set and add a 'rating' dictionary element.
for query in test_queries:
    query['rating'] = mean_rating

# Write the prediction file.
util.write_predictions(test_queries, pred_filename)
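
# util here is the course-provided helper; as a rough sketch only (assumed
# column names, not util's actual code), write_predictions for this task is
# presumably a thin CSV writer over the query dicts:
import csv

def write_predictions_sketch(queries, filename):
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['user', 'isbn', 'rating'])  # assumed header
        for q in queries:
            writer.writerow([q['user'], q['isbn'], q['rating']])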



Example No. 40
0
def main(load=False, test=False, both=False):
    train_dir = "train"
    test_dir = "test"
    outputfile = "treepredictions.csv"  # feel free to change this or take it as an argument

    if not load:
        # extract features
        print "extracting training features..."
        X_train, global_feat_dict, t_train, train_ids = extract_feats(
            ffs, train_dir)
        print "done extracting training features"
        print
        print "Saving features"
        with open("X_train", "w") as out:
            pickle.dump(X_train, out)
        with open("global_feat_dict", "w") as out:
            pickle.dump(global_feat_dict, out)
        with open("t_train", "w") as out:
            pickle.dump(t_train, out)
        with open("train_ids", "w") as out:
            pickle.dump(train_ids, out)
        print "Done saving"
    else:
        print "Loading previous features"
        with open("X_train", "r") as out:
            X_train = pickle.load(out)
        with open("global_feat_dict", "r") as out:
            global_feat_dict = pickle.load(out)
        with open("t_train", "r") as out:
            t_train = pickle.load(out)
        with open("train_ids", "r") as out:
            train_ids = pickle.load(out)
        print "Done loading"
        print

    # if we're verifying things, save some test data
    if not test:
        print "Getting holdout data..."
        Xs, ts, ids = (X_train, t_train, train_ids)
        n = Xs.shape[0]
        train_pct = 0.8

        X_train = Xs[-int(n * train_pct):]
        t_train = ts[-int(n * train_pct):]
        train_ids = ids[-int(n * train_pct):]

        X_holdout = Xs[:-int(n * train_pct)]
        t_holdout = ts[:-int(n * train_pct)]
        holdout_ids = ids[:-int(n * train_pct)]
        print
    # TODO train here, and learn your classification parameters
    print "learning..."
    num_trees = 100
    forest = RandomForestClassifier(n_estimators=num_trees)
    forest = forest.fit(X_train.todense(), t_train)
    # Random forest predictor
    forest_predictor, _ = sk_random_forest(X_train.toarray(),
                                           t_train,
                                           num_trees=num_trees)
    # logistic regression predictor
    # log_predictor, _ = sk_logistic(X_train, t_train)
    print "done learning"
    print

    # get rid of training data and load test data
    # del X_train
    # del t_train
    # del train_ids

    # if you want to write predictions for test data
    if test:
        # if you didn't save both sets of features, extract
        if not both:
            print "extracting test features..."
            X_test, _, t_ignore, test_ids = extract_feats(
                ffs, test_dir, global_feat_dict=global_feat_dict)
            print "done extracting test features"
            print
            print "Saving test features"
            with open("X_test", "w") as out:
                pickle.dump(X_test, out)
            with open("test_ids", "w") as out:
                pickle.dump(test_ids, out)
            print "Done saving"
            print
        else:
            print "Loading previous test features"
            with open("X_test", "r") as out:
                X_test = pickle.load(out)
            with open("test_ids", "r") as out:
                test_ids = pickle.load(out)
            print "Done loading"
            print
        # TODO make predictions here
        print "making predictions..."
        preds = forest.predict(X_test.toarray())
        print "done making predictions"

        print "writing predictions..."
        util.write_predictions(preds, test_ids, outputfile)
        print "done!"
    else:
        error = 0
        total = X_holdout.shape[0]
        print "making predictions..."
        #preds = np.argmax(X_test.dot(learned_W),axis=1)
        #preds = logreg.predict(X_test)
        random.seed(datetime.now())
        for index, feats in enumerate(X_holdout.toarray()):
            pred_forest = forest_predictor(feats)
            # pred_logistic = log_predictor(feats)
            # #if they agree, or disagree and both predict malware
            # if pred_forest == pred_logistic or (pred_forest != 8 and pred_logistic != 8):
            #     prediction = pred_forest
            # else:
            #     # grab the non-"None" label
            #     other = pred_forest if pred_forest != 8 else pred_logistic
            #     # flip a coin
            #     if random.random() < 0.39:
            #         prediction = 8
            #     else:
            #         prediction = other
            prediction = pred_forest

            if (prediction != t_holdout[index]):
                print "%s: expected %d but got %d" % (
                    holdout_ids[index], t_holdout[index], prediction)
                error += 1
        print "Correct: %d, Incorrect: %d, Total: %d, Accuracy: %f" % (
            total - error, error, total, (total - error) / (1.0 * total))
        print "done making predictions"
        print
    print
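
# sk_random_forest is defined elsewhere in this file; judging from its use
# above it wraps sklearn and returns a per-row prediction closure. A hedged
# sketch (an assumption, not the original helper):
from sklearn.ensemble import RandomForestClassifier

def sk_random_forest(X, t, num_trees=100):
    clf = RandomForestClassifier(n_estimators=num_trees).fit(X, t)
    # return a closure that classifies a single feature row, plus the model
    return (lambda feats: clf.predict([feats])[0]), clf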
Example No. 41
0
print "done extracting training features"
print

# TODO train here, and return regression parameters
print "learning..."
learned_w = splinalg.lsqr(X_train,y_train)[0]
print "done learning"
print
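
# Aside (not the original code): splinalg.lsqr also accepts a damp argument,
# which solves the ridge-regularized problem min ||Xw - y||^2 + damp^2 ||w||^2
# and can stabilize the fit when features are collinear, e.g.:
# learned_w = splinalg.lsqr(X_train, y_train, damp=1.0)[0]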

# get rid of training data and load test data
del X_train
del y_train
del train_ids
print "extracting test features..."
X_test,_,y_ignore,test_ids = extract_feats(ffs, testfile, global_feat_dict=global_feat_dict)
print "done extracting test features"
print

# TODO make predictions on test data and write them out
print "making predictions..."
preds = X_test.dot(learned_w)
print "done making predictions"
print

print "writing predictions..."
util.write_predictions(preds, test_ids, outputfile)
print "done!"

import pickle
pickle.dump(test_ids, open('test_ids.p','wb'))
Example No. 42
0
    def GetFitSolutionResults(self, request, context):
        """
        TA2-3 API call
        """
        logging.info("Message received: GetFitSolutionResults")
        request_id = request.request_id
        request_params = self._solution_score_map[request_id]
        start=solutiondescription.compute_timestamp()

        solution_id = request_params.solution_id

        if solution_id not in self._solutions:
            logging.info("GetFitSolutionResults: Solution %s not found!", solution_id)
            msg = core_pb2.Progress(state=core_pb2.ERRORED, status="", start=start, end=solutiondescription.compute_timestamp())
            # Clean up
            self._solution_score_map.pop(request_id, None)
            yield core_pb2.GetFitSolutionResultsResponse(progress=msg, steps=[], exposed_outputs=[], fitted_solution_id=None)
        else:
            solution = self._solutions[solution_id]

            msg = core_pb2.Progress(state=core_pb2.RUNNING, status="", start=start, end=solutiondescription.compute_timestamp())
            
            fitted_solution = copy.deepcopy(solution)
            fitted_solution.id = str(uuid.uuid4())
            fitted_solution.create_pipeline_json(self._primitives) 
            self._solutions[fitted_solution.id] = fitted_solution

            inputs = self._get_inputs(solution.problem, request_params.inputs)
            try:
                output = fitted_solution.fit(inputs=inputs, solution_dict=self._solutions)
            except Exception:
                logging.info(fitted_solution.primitives)
                logging.info(sys.exc_info()[0])
                output = None

            result = None
            outputDir = os.environ['D3MOUTPUTDIR']

            if isinstance(output, np.ndarray):
                output = pd.DataFrame(data=output)

            if output is not None:
                uri = util.write_predictions(output, outputDir + "/predictions", fitted_solution)
                uri = 'file://{uri}'.format(uri=os.path.abspath(uri)) 
                result = value_pb2.Value(csv_uri=uri)
            else:
                result = value_pb2.Value(error = value_pb2.ValueError(message="Output is NULL"))

            yield core_pb2.GetFitSolutionResultsResponse(progress=msg, steps=[], exposed_outputs=[], fitted_solution_id=fitted_solution.id)

            msg = core_pb2.Progress(state=core_pb2.COMPLETED, status="", start=start, end=solutiondescription.compute_timestamp())

            steps = []
            for i in range(fitted_solution.num_steps()):
                steps.append(core_pb2.StepProgress(progress=msg))

            exposed_outputs = {}
            if request_params.expose_outputs is not None and len(request_params.expose_outputs) > 0:
                last_step_output = request_params.expose_outputs[len(request_params.expose_outputs)-1]
            else:
                last_step_output = fitted_solution.outputs[0][2]

            exposed_outputs[last_step_output] = result

            # Clean up
            self._solution_score_map.pop(request_id, None)

            yield core_pb2.GetFitSolutionResultsResponse(progress=msg, steps=steps, exposed_outputs=exposed_outputs, fitted_solution_id=fitted_solution.id)
Example No. 43
0
                                bidir=True)
if using_GPU:
    RNNseq_model = RNNseq_model.cuda()
    state_dict = torch.load(args.rnn_model_path)['state_dict']
else:
    state_dict = torch.load(args.rnn_model_path,
                            map_location='cpu')['state_dict']
# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[7:]  # remove `module.`
    new_state_dict[name] = v
# load params
RNNseq_model.load_state_dict(new_state_dict)
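# Aside (a variant, not the original code): stripping the prefix only when it
# is actually present avoids mangling keys from checkpoints saved without
# nn.DataParallel:
# new_state_dict = OrderedDict(
#     (k[len('module.'):] if k.startswith('module.') else k, v)
#     for k, v in state_dict.items())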
result = write_predictions(raw_test_rcc, test_dataloader_rcc, RNNseq_model,
                           using_GPU, args.not_found_test_path)
logging.info("Write predictions to {}".format(args.not_found_test_path))
f = open(args.not_found_test_path, 'w')
writer = csv.writer(f)
writer.writerows(result)
f.close()
logging.info("*" * 25 + " Mention Labeling By LSTM tagging model " + "*" * 25)

##############
# classfying #
##############
logging.info("*" * 25 + " Dataset Recognition By CNN Text Classifier " +
             "*" * 25)
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
logging.info("Loading tagFile")
idx_to_class = {}
Example No. 44
0
def main():
    priors = [
        .0369, .0162, .012, .0103, .0133, .0126, .0172, .0133, .5214, .0068,
        .1756, .0104, .1218, .0191, .013
    ]

    ##########################
    ####System Counts#########
    ##########################
    # define global set for creating data frames
    # test_tree_list, test_classes, test_ids = extract_tree("test")
    # globalSetTest = set()
    # dictListTest = list()
    # for tree in test_tree_list:
    #     dictListTest.append(perSysCallCount(tree, globalSetTest))

    # train_tree_list, train_classes, train_ids = extract_tree("train")
    # dictListTrain = list()
    # for tree in train_tree_list:
    #     dictListTrain.append(perSysCallCount(tree, globalSetTest))

    # newPerSysCallCountFile(dictListTest,test_classes, test_ids, "perSysCountsTest.csv", globalSetTest)
    # newCountFile(test_tree_list, test_classes, test_ids, "choppyTest.csv")
    # del test_tree_list,test_classes,dictListTest,test_ids

    # newPerSysCallCountFile(dictListTrain,train_classes,train_ids, "perSysCountsTrain.csv",globalSetTest)
    #newCountFile(train_tree_list, train_classes, train_ids, "choppyTrain.csv")
    # del train_tree_list,train_classes,train_ids,dictListTrain

    ###############################################
    #######Per-Tree, Per-System Call Counts########
    ###############################################
    """
    Read in train and test as Pandas DataFrames
    """
    # df_train = pd.read_csv("choppyTrain.csv")
    # df_test = pd.read_csv("choppyTest.csv")
    df_train = pd.read_csv("perSysCountsTrain.csv")
    df_test = pd.read_csv("perSysCountsTest.csv")
    #store class values
    Y_train = df_train.Class.values
    testID = df_test.Id.values
    #row where testing examples start
    test_idx = df_train.shape[0]
    df_all = pd.concat((df_train, df_test), axis=0)
    del df_train
    del df_test
    df_all = df_all.drop(['Id'], axis=1)
    df_all = df_all.drop(['Class'], axis=1)
    vals = df_all.values
    del df_all
    X_train = vals[:test_idx]
    X_test = vals[test_idx:]
    del vals
    # clf = bnb(class_prior=priors)
    # clf.fit(X_train, Y_train)
    clf = mnb(class_prior=priors)
    clf.fit(X_train, Y_train)
    del X_train
    del Y_train
    # bnb_predict = clf.predict(X_test)
    mnb_predict = clf.predict(X_test)
    # util.write_predictions(bnb_predict,test_ids,"ChoppySingleBNB.csv")
    util.write_predictions(mnb_predict, testID, "PerSysCallCountsBNB.csv")
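    # mnb and bnb are import aliases defined elsewhere in this file; based on
    # their usage and the class_prior argument they are presumably (an
    # assumption, not confirmed by this snippet):
    #   from sklearn.naive_bayes import MultinomialNB as mnb
    #   from sklearn.naive_bayes import BernoulliNB as bnb
    # note that the output filename says BNB although the model fitted above
    # is the multinomial one (mnb)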
Example No. 45
0
clf = SGDClassifier(penalty = 'elasticnet', alpha = 0.000001)
clf.fit(X, y)
print clf.score(X, y)



# Output predictions
X_test = np.load(open('x_test', 'rb'))
from sklearn.preprocessing import StandardScaler
X_test = np.log(X_test + 1)
X_test = StandardScaler().fit_transform(X_test)

y_pred = clf.predict(X_test)
ids = np.load(open('ids', 'rb'))
import util
util.write_predictions(y_pred, ids, 'predictions/rf_pc10.csv')
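
# Aside: fitting StandardScaler on the test matrix alone (as above) derives
# the scaling statistics from test data; the more standard pattern is to fit
# the scaler on the training matrix and reuse it, e.g. (a sketch, assuming X
# was log-transformed the same way earlier):
# scaler = StandardScaler().fit(np.log(X + 1))
# X_test = scaler.transform(np.log(X_test + 1))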


"""
from sklearn.ensemble import ExtraTreesClassifier
erf = ExtraTreesClassifier(n_estimators=300, max_features='auto',
                           bootstrap=True, oob_score=True,
                           criterion='gini',
                           max_depth=None, min_samples_split=2,
                           min_samples_leaf=1,
                           n_jobs=1,
                           random_state=None, verbose=0, min_density=None,
                           compute_importances=None)
erf.fit(X_train, y_train)
print "oob\t%.4f" % (erf.oob_score_)
"""
Example No. 46
0
def main():
    train_dir = "../../../Data/train"
    test_dir = "../../../Data/test"
    outputfile = "../../Output/Jeremiah.csv"  # feel free to change this or take it as an argument
 
    ################################
    #### Empirical Summary 
    ################################

    #Get types & fre of commands
    
    #raw count
    lesNames =  call_freq_emp(train_dir)
    lesNames_freq = pd.Series(lesNames.values(), lesNames.keys())
    lesNames_freq = lesNames_freq/sum(lesNames_freq)
    lesNames_freq.sort()
    lesNames_freq

    #raw count by Type
    lesNames_byType =  call_freq_byType(train_dir)
    
    lesNames_byType_freq = dict()
    for TypeName in util.malware_classes:
        lesNames_byType_freq[TypeName] = []
    
    for keyName in lesNames_byType.keys():
        namez = lesNames_byType[keyName]
        namez_freq = pd.Series(namez.values(), namez.keys())
        namez_freq = namez_freq/sum(namez_freq)
        namez_freq.sort(ascending = False)
        lesNames_byType_freq[keyName] = namez_freq[namez_freq > 0.01]
        print(keyName + " Finished!")
        sys.stdout.flush()
    
    lesNames_byType_freq #most frequent commands in each class
    
    
    ##Bar plot
    t_label = np.array(util.malware_classes)[np.array(t_train)] 
    
    
    bar_df = stats.itemfreq(t_label).T
    bar_df = stats.itemfreq(t_train).T
    
    
    bar_df = pd.DataFrame(data= bar_df).T
    bar_df.columns = ['name', 'count']
    bar_df[['count']] = bar_df[['count']].astype(int)
    
    ggplot(aes(x = "name", weight = "count"), bar_df) + \
           xlab("count") + geom_bar() + \
           ggtitle("Frequency Count for Malware Types")
    
    
    
    ################################
    #### Feature Extraction and Pruning
    ################################

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats, \
            call_freq, dll_type]#, get_all_keys]

    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    X_train_dense = X_train.todense()
    del X_train
    print "done extracting training features"
    print
    sys.stdout.flush()
    
    #pruning
    X_train_prune, global_feat_dict_prune, featureFreq, prunId = \
    pruneFeatures(minFreq = 1, X_train_dense = X_train_dense, \
                    global_feat_dict = global_feat_dict)

    ################################
    #### CV-based training 
    ################################
    n = X_train_dense.shape[0]
    nForest = 1000
    n_cv = 5
    
    print str(n_cv) + " fold learning initiated..."
    eRate_cv = []
    kf_cv = cv.KFold(n, n_folds = n_cv)
    clf_cv = es.RandomForestClassifier(n_estimators = nForest)
    i = 0
    
    for train_index, test_index in kf_cv:
        i += 1
        #create CV dataset and fit
        F_train, F_test = X_train_prune[train_index], X_train_prune[test_index]
        y_train, y_test = t_train[train_index], t_train[test_index]
        clf_fit = clf_cv.fit(F_train, y_train)
        #prediction
        clf_pred = clf_fit.predict(F_test)
        accuracy = Accuracy(clf_pred, y_test)[0]
        eRate_cv.append(accuracy)
        print("Fold " + str(i) + " Classification Accuracy = " + str(accuracy))
        sys.stdout.flush()
    print "done learning"
    print
    
    np.mean(eRate_cv)

    ################################
    #feature importance assessment: 
    ################################
    # train here, and learn your classification parameters
    print "learning..."
    nForest = 1000
    clf = es.RandomForestClassifier(n_estimators = nForest, \
                verbose = 1, n_jobs = -1)
    clf_fit = clf.fit(X_train_dense, t_train)
    print "done learning"
    print

    #TODO: Figure out param Name that Feature Importance corresponds to
    ftImp = pd.DataFrame(sorted(global_feat_dict.keys()), \
                            columns = ["Name"])
    ftImp["FeatureImp"] = clf_fit.feature_importances_
    ftImp_s = ftImp.sort(columns = "FeatureImp", ascending = False)

    print_full(ftImp_s)
    ftImp_s.loc[ ftImp_s['FeatureImp']> 0.000, :]

    ####################################
    # in sample prediction and mis-classification rate
    ####################################
    
    print "making in-sample predictions..."
    clf_preds = clf_fit.predict(X_train_dense)
    clf_missId = ((clf_preds - t_train) != 0)
    clf_miss = t_train[(clf_preds - t_train) != 0]
    
    rate = 1 - np.mean(clf_missId)  # in-sample accuracy (1 - error rate)
    
    clf_miss = [util.malware_classes[i] for i in clf_miss]
    stats.itemfreq(clf_miss)
    print "done making in-sample predictions"
    
    # get rid of training data and load test data
    # (X_train itself was already deleted above, when the dense copy was made)
    del X_train_dense
    del t_train
    del train_ids
    print "extracting test features..."
    X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    X_test_dense = X_test.todense()
    X_test_prune = X_test_dense.T[prunId].T
    
    print "done extracting test features"
    print
    
    # TODO make predictions on test data and write them out
    print "making predictions..."
    clf_preds = clf_fit.predict(X_test_prune)
    print "done making predictions"
    print
    
    print "writing predictions..."
    util.write_predictions(clf_preds, test_ids, outputfile)
    print "done!"
Example No. 47
0
def main(load = False, test=False):
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument
    
    if not load:
        # extract features
        print "extracting training features..."
        X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
        print "done extracting training features"
        print
        print "Saving features"
        with open("X_train", "w") as out:
            pickle.dump(X_train, out)
        with open("global_feat_dict", "w") as out:
            pickle.dump(global_feat_dict, out)
        with open("t_train", "w") as out:
            pickle.dump(t_train, out)
        with open("train_ids", "w") as out:
            pickle.dump(train_ids, out)
        print "Done saving"
        print
    else:
        print "Loading previous features"
        with open("X_train", "r")           as out: X_train =           pickle.load(out)

        with open("global_feat_dict", "r")  as out: global_feat_dict =  pickle.load(out)

        with open("t_train", "r")           as out: t_train =           pickle.load(out)

        with open("train_ids", "r")         as out: train_ids =         pickle.load(out)
        print "Done loading"
        print
    
    # TODO train here, and learn your classification parameters
    print "learning..."
    predictor, _ = sk_logistic(X_train, t_train)
    # distribs = train_generative(X_train, t_train, len(global_feat_dict))
    # Start with logistic regression
    print "done learning"
    print
    
    # get rid of training data and load test data
    # del X_train
    # del t_train
    # del train_ids
    print "extracting test features..."
    X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print
    
    # TODO make predictions on test data and write them out
    error = 0
    total = X_train.shape[0]
    print "making predictions..."
    # preds = np.argmax(X_test.dot(learned_W),axis=1)
    # preds = gen_classifier(X_train, distribs)
    # for t_id, p, t in zip(train_ids, preds, t_train):
    #     if (p != t):
    #         print "%s: expected %d but got %d" % (t_id, t, p)
    #         error += 1
    if test:
        preds = []
        for x in X_test:
            preds.append(predictor(x))
    else:
        for index, feats in enumerate(X_train):
            prediction = predictor(feats)
            if (prediction != t_train[index]):
                print "%s: expected %d but got %d" % (train_ids[index], t_train[index], prediction)
                error += 1
        print "Correct: %d, Incorrect: %d, Total: %d, Accuracy: %f" % (total - error, error, total, (total - error) / (1.0 * total))
    print "done making predictions"
    print
    
    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example No. 48
0
def main(bayesian=False):
    outputfile = "match_predictions.csv"
    actualfile = "match_actual.csv"

    start_date = datetime.datetime(2003, 1, 1)
    middle_date = datetime.datetime(2012, 1, 1)
    middle_date_plus_one = datetime.datetime(2013, 1, 1)
    end_date = datetime.datetime(2014, 1, 1)

    print "extracting training features..."
    # bayesian technique uses all dates
    # for RMSE, we need train and test set
    if not bayesian:
        # use middle_date when you desire 2013 and 2014 for the test set
        X_train, feat_name_to_col_num, y_train, train_ids = extract_feats(start_date, middle_date_plus_one)
    else:
        X_train, feat_name_to_col_num, y_train, train_ids = extract_feats(start_date, end_date)
    print "done extracting training features"
    print

    print "learning..."
    prior_mean = np.zeros(len(feat_name_to_col_num))
    prior_cov = np.identity(len(feat_name_to_col_num))
    prior_hessian = np.linalg.inv(prior_cov)
    posterior_mean, posterior_hessian = PoissonRegression.fit_bayes_poisson(y_train, X_train.toarray(),
                                                                            prior_mean, prior_hessian)
    print "done learning"

    print "Learned Coeffs: " + str(posterior_mean)
    print "Learned Correlations: " + str(posterior_hessian)

    # only need test set when not fully bayesian - i.e. when you want RMSE
    if not bayesian:
        del X_train
        del y_train
        del train_ids

        print "extracting test features..."
        X_test, _, y_ignore, test_ids = extract_feats(end_date, end_date, feat_name_to_col_num=feat_name_to_col_num)
        print "done extracting test features"
        print

    print "making predictions..."
    if not bayesian:
        preds = np.exp(X_test.toarray().dot(posterior_mean))
    else:
        preds = np.exp(X_train.toarray().dot(posterior_mean))
    print "done making predictions"
    print

    print "writing predictions..."
    if not bayesian:
        util.write_predictions(preds, test_ids, outputfile)
        util.write_predictions(y_ignore, test_ids, actualfile)
    else:
        util.write_predictions(preds, train_ids, outputfile)
        util.write_predictions(y_train, train_ids, actualfile)
    print "done writing"

    print "RMSE: " + str(mae.rmse())

    # get model evidence only when fully bayesian
    if bayesian:
        print "Marginal Likelihood: " + str(PoissonRegression.get_model_evidence(posterior_mean, prior_mean,
                                                                                 prior_hessian, y_train,
                                                                                 X_train.toarray()))

    print "Feature dictionary: " + str(feat_name_to_col_num)
    print "P-values: " + str(PoissonRegression.get_pvalues(posterior_mean, posterior_hessian))
def main():
    train_dir = "../../../Data/train"
    test_dir = "../../../Data/test"
    outputfile = "../../Output/Jeremiah.csv"  # feel free to change this or take it as an argument

    ################################
    #### Empirical Summary
    ################################

    #Get types & fre of commands

    #raw count
    lesNames = call_freq_emp(train_dir)
    lesNames_freq = pd.Series(lesNames.values(), lesNames.keys())
    lesNames_freq = lesNames_freq / sum(lesNames_freq)
    lesNames_freq.sort()
    lesNames_freq

    #raw count by Type
    lesNames_byType = call_freq_byType(train_dir)

    lesNames_byType_freq = dict()
    for TypeName in util.malware_classes:
        lesNames_byType_freq[TypeName] = []

    for keyName in lesNames_byType.keys():
        namez = lesNames_byType[keyName]
        namez_freq = pd.Series(namez.values(), namez.keys())
        namez_freq = namez_freq / sum(namez_freq)
        namez_freq.sort(ascending=False)
        lesNames_byType_freq[keyName] = namez_freq[namez_freq > 0.01]
        print(keyName + " Finished!")
        sys.stdout.flush()

    lesNames_byType_freq  #most frequent commands in each class

    ##Bar plot
    t_label = np.array(util.malware_classes)[np.array(t_train)]

    bar_df = stats.itemfreq(t_label).T
    bar_df = stats.itemfreq(t_train).T

    bar_df = pd.DataFrame(data=bar_df).T
    bar_df.columns = ['name', 'count']
    bar_df[['count']] = bar_df[['count']].astype(int)

    ggplot(aes(x = "name", weight = "count"), bar_df) + \
           xlab("count") + geom_bar() + \
           ggtitle("Frequency Count for Malware Types")

    ################################
    #### Feature Extraction and Pruning
    ################################

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats, \
            call_freq, dll_type]#, get_all_keys]

    # extract features
    print "extracting training features..."
    X_train, global_feat_dict, t_train, train_ids = extract_feats(
        ffs, train_dir)
    X_train_dense = X_train.todense()
    del X_train
    print "done extracting training features"
    print
    sys.stdout.flush()

    #pruning
    X_train_prune, global_feat_dict_prune, featureFreq, prunId = \
    pruneFeatures(minFreq = 1, X_train_dense = X_train_dense, \
                    global_feat_dict = global_feat_dict)

    ################################
    #### CV-based training
    ################################
    n = X_train_dense.shape[0]
    nForest = 1000
    n_cv = 5

    print str(n_cv) + " fold learning initiated..."
    eRate_cv = []
    kf_cv = cv.KFold(n, n_folds=n_cv)
    clf_cv = es.RandomForestClassifier(n_estimators=nForest)
    i = 0

    for train_index, test_index in kf_cv:
        i += 1
        #create CV dataset and fit
        F_train, F_test = X_train_prune[train_index], X_train_prune[test_index]
        y_train, y_test = t_train[train_index], t_train[test_index]
        clf_fit = clf_cv.fit(F_train, y_train)
        #prediction
        clf_pred = clf_fit.predict(F_test)
        accuracy = Accuracy(clf_pred, y_test)[0]
        eRate_cv.append(accuracy)
        print("Fold " + str(i) + " Classification Accuracy = " + str(accuracy))
        sys.stdout.flush()
    print "done learning"
    print

    np.mean(eRate_cv)

    ################################
    #feature importance assessment:
    ################################
    # train here, and learn your classification parameters
    print "learning..."
    nForest = 1000
    clf = es.RandomForestClassifier(n_estimators = nForest, \
                verbose = 1, n_jobs = -1)
    clf_fit = clf.fit(X_train_dense, t_train)
    print "done learning"
    print

    #TODO: Figure out param Name that Feature Importance corresponds to
    ftImp = pd.DataFrame(sorted(global_feat_dict.keys()), \
                            columns = ["Name"])
    ftImp["FeatureImp"] = clf_fit.feature_importances_
    ftImp_s = ftImp.sort(columns="FeatureImp", ascending=False)

    print_full(ftImp_s)
    ftImp_s.loc[ftImp_s['FeatureImp'] > 0.000, :]

    ####################################
    # in sample prediction and mis-classification rate
    ####################################

    print "making in-sample predictions..."
    clf_preds = clf_fit.predict(X_train_dense)
    clf_missId = ((clf_preds - t_train) != 0)
    clf_miss = t_train[(clf_preds - t_train) != 0]

    rate = 1 - np.mean(clf_missId)  # in-sample accuracy (1 - error rate)

    clf_miss = [util.malware_classes[i] for i in clf_miss]
    stats.itemfreq(clf_miss)
    print "done making in-sample predictions"

    # get rid of training data and load test data
    # (X_train itself was already deleted above, when the dense copy was made)
    del X_train_dense
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    X_test_dense = X_test.todense()
    X_test_prune = X_test_dense.T[prunId].T

    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    clf_preds = clf_fit.predict(X_test_prune)
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(clf_preds, test_ids, outputfile)
    print "done!"
Example No. 50
0
import numpy
import util
import shared_utils

pred_filename  = 'pred-user-hamming.csv'
train_filename = 'ratings-train.csv'
test_filename  = 'ratings-test.csv'

training_data  = util.load_train(train_filename)
test_queries   = util.load_test(test_filename)

user_common_books = shared_utils.unpickle('user_common_books')
user_difference_ratings = shared_utils.unpickle('user_difference_ratings')

print user_common_books[0:]
print user_difference_ratings[0:]

ratings_filename = 'ratings_std'
mother = shared_utils.unpickle(ratings_filename)
'''
for query in test_queries:
	user = query['user']
	user_cluster = numpy.dot(R[user - 1,:], range(k))
	isbn = query['isbn']
	book_index = mother['book_isbn_to_index'][isbn]
	query['rating'] = U[user_cluster][book_index] * mother['variance'] + mother['mean']

util.write_predictions(test_queries, pred_filename)
'''
Example No. 51
0
# Store data for each user to keep track of the per-user average.
users = {}
for user in user_list:
    users[user['user']] = {
        'total': 0,  # For storing the total of ratings.
        'count': 0,  # For storing the number of ratings.
    }

# Iterate over the training data to compute means.
for rating in training_data:
    user_id = rating['user']
    users[user_id]['total'] += rating['rating']
    users[user_id]['count'] += 1

# Make predictions for each test query.
for query in test_queries:

    user = users[query['user']]

    if user['count'] == 0:
        # Perhaps we did not having any ratings in the training set.
        # In this case, make a global mean prediction.
        query['rating'] = mean_rating

    else:
        # Predict the average for this user.
        query['rating'] = float(user['total']) / user['count']

# Write the prediction file.
util.write_predictions(test_queries, pred_filename)
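
# Aside (not the original method): per-user means computed from only a few
# ratings are noisy; a common refinement shrinks them toward the global mean,
# with the strength alpha as a tuning constant (values here are assumptions):
# alpha = 5.0
# query['rating'] = (user['total'] + alpha * mean_rating) / (user['count'] + alpha)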
Example No. 52
0
"""
# Visualize train fit
from pandas.tools.plotting import scatter_matrix
from pandas import DataFrame
df = DataFrame(np.concatenate((y_test[:,np.newaxis], y_hat[:,np.newaxis]), axis=1))
scatter_matrix(df, alpha=0.2, figsize=(15, 15), diagonal='kde')

print mean_absolute_error(np.exp(y_test), np.exp(y_hat))
"""

#Output predictions
y_out = np.exp(y_hat)
import pickle
import util
test_ids = pickle.load(open('test_ids.p','rb'))
util.write_predictions(y_out, test_ids, outfile)





"""

##################################
##
## Model Selection
##
##################################

### OLS Train Scores
rss = 0
Example No. 53
0
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "predictions3062019.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    ffs = [
        system_call_termination_reason, system_call_bigrams,
        system_call_trigrams
    ]
    # ffs = [first_last_system_call_feats, system_call_count_feats, system_call_termination_reason, system_call_count_feat_types, system_call_processes, system_call_unsuccessful]
    # ffs = [system_call_termination_reason]
    # extract features

    first = True

    if first:
        print "extracting training features..."
        X_train, global_feat_dict, t_train, train_ids = extract_feats(
            ffs, train_dir)
        # print "\n\n\n"
        # print len(global_feat_dict)
        print "done extracting training features"
        print
        # X_train, X_test, t_train, t_test = train_test_split(X_train, t_train, random_state = 1)
        # pickle.dump( (X_train, X_test, t_train, t_test, global_feat_dict,t_train,train_ids), open( "save.p", "wb" ) )
    else:
        X_train, X_test, t_train, t_test, global_feat_dict, t_train, train_ids = pickle.load(
            open("save.p", "rb"))
        print(len(global_feat_dict))

    # TODO train here, and learn your classification parameters
    print "learning..."
    print(X_train.shape)

    # for n in [10, 100, 1000]:
    #     clf = RandomForestClassifier(n_estimators = 100)
    #     clf.fit(X_train,t_train)
    #     print "done learning"
    #     print n
    #     print

    #     print "score"
    #     print clf.score(X_test, t_test)

    import xgboost  # imported but not used below; left over from experimentation

    clf = RandomForestClassifier(n_estimators=100000)
    # clf = LinearSVC()
    # clf = MLPClassifier(hidden_layer_sizes = (80, 80), max_iter=5000, random_state = 1, alpha=0.05)

    # clf = GaussianNB()
    clf.fit((X_train), t_train)
    print "done learning"
    print

    # print "score"
    # print clf.score(X_test, t_test)

    # get rid of training data and load test data
    del X_train
    del t_train
    del train_ids
    print "extracting test features..."
    X_test, _, t_ignore, test_ids = extract_feats(
        ffs, test_dir, global_feat_dict=global_feat_dict)
    print "done extracting test features"
    print

    # TODO make predictions on test data and write them out
    print "making predictions..."
    preds = clf.predict((X_test))
    print "done making predictions"
    print

    print "writing predictions..."
    util.write_predictions(preds, test_ids, outputfile)
    print "done!"
Example No. 54
0
for datum in test_data:
    '''cluster = clusters[users[datum['user']]['index']]
    sum_ratings = 0.
    num_ratings = 0
    for (u,r) in train_sorted[b_keys[datum['isbn']]]:
        if (u in cluster):
            sum_ratings += r
            num_ratings += 1
    if (num_ratings == 0):
        sum_errors += math.pow(mean_rating - datum['rating'],2)
    else:
        sum_errors += math.pow((sum_ratings/num_ratings)-datum['rating'],2)'''
    sum_errors += math.pow(cluster_avgs[users[datum['user']]['index']] - datum['rating'], 2)
print math.sqrt(sum_errors / len(test_data))

'''for query in test_queries:
    cluster = clusters[users[query['user']]]
    sum_ratings = 0.
    num_ratings = 0
    for (u,r) in train_sorted[b_keys[query['isbn']]]:
        if (u in cluster):
            sum_ratings += r
            num_ratings += 1
    if (num_ratings == 0):
        query['rating'] = mean_rating
    else:
        query['rating'] = sum_ratings / num_ratings

# Write the prediction file.
util.write_predictions(test_queries, pred_filename)'''
# format test data
test_ids = df_test.Id.values
df_test = df_test.drop(['Id'], axis=1)
X_test = df_test.values

print "Train features:", X_train.shape
print "Train class:", Y_train.shape
print "Test features:", X_test.shape


# RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100, max_features='log2')
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)
write_predictions(RF_pred, test_ids, 'predicted_RF.csv')
# print 'RandomForestClassifier', categorization_accuracy('predicted_RF01.csv', 'actual_small.csv')

# 0.891444342226 - n_estimators=100, max_features='None'
# 0.894204231831 - n_estimators=50, max_features='log2'
# 0.897884084637 - n_estimators=100, max_features='log2'
# 0.896964121435 - n_estimators=125, max_features='log2'
# 0.896044158234 - n_estimators=115, max_features='log2'

# # QuadraticDiscriminantAnalysis
# QD = QuadraticDiscriminantAnalysis()
# QD.fit(X_train, Y_train)
# QD_pred = QD.predict(X_test)
# write_predictions(QD_pred, test_ids, 'predicted_QD.csv')
# print 'QuadraticDiscriminantAnalysis', categorization_accuracy('predicted_QD.csv', 'actual_small.csv')
#