Esempio n. 1
0
def write_canned_predictions(tf, task_labels, out_labels):
    """Blend the stored SVM and random-forest probabilities and write them out.

    Reads two raw probability dumps from fixed paths under
    /kaggle/malware/results, views each one as rows of 9 values, combines
    them with vote() and passes the result to write_to_csv().

    Args:
        tf: object whose get_val_inputs() returns an iterable of names; each
            name has its extension stripped to form the row label.
        task_labels: forwarded unchanged to write_to_csv().
        out_labels: forwarded unchanged to write_to_csv().
    """
    # SVM dump first, random-forest dump second: the weight list handed to
    # vote() below is given in the same order.
    prob_files = ('/kaggle/malware/results/sl_svm.prob',
                  '/kaggle/malware/results/sl_ccrf.prob')
    probabilities = [np.fromfile(prob_file).reshape(-1, 9, order='C')
                     for prob_file in prob_files]

    blended = vote(probabilities, [4./5., 1./3.])

    # Row labels are the validation input names without their extension.
    sample_ids = []
    for file_name in tf.get_val_inputs():
        sample_ids.append(path.splitext(file_name)[0])

    write_to_csv(task_labels, sample_ids, blended, out_labels)
Esempio n. 2
0
def write_canned_predictions(tf, task_labels, out_labels):
    """Blend the stored SVM and random-forest probabilities and write them out.

    Reads two raw probability dumps from fixed paths under
    /kaggle/malware/results, views each one as rows of 9 values, combines
    them with vote() and passes the result to write_to_csv().

    Args:
        tf: object whose get_val_inputs() returns an iterable of names; each
            name has its extension stripped to form the row label.
        task_labels: forwarded unchanged to write_to_csv().
        out_labels: forwarded unchanged to write_to_csv().
    """
    # SVM dump first, random-forest dump second: the weight list handed to
    # vote() below is given in the same order.
    prob_files = ('/kaggle/malware/results/sl_svm.prob',
                  '/kaggle/malware/results/sl_ccrf.prob')
    probabilities = [np.fromfile(prob_file).reshape(-1, 9, order='C')
                     for prob_file in prob_files]

    blended = vote(probabilities, [4./5., 1./3.])

    # Row labels are the validation input names without their extension.
    sample_ids = []
    for file_name in tf.get_val_inputs():
        sample_ids.append(path.splitext(file_name)[0])

    write_to_csv(task_labels, sample_ids, blended, out_labels)
Esempio n. 3
0
#ax.set_ylabel('Predicted')
#ax.xticks(np.unique(Y_test))
#ax.yticks(np.unique(Y_test))

#fig.show()

# Sweep blend weights for the two models: every value of x is combined with
# every value of y, and the log loss against Y_test is recorded for both
# orderings of the pair.
x = 1. / np.arange(1., 6)
y = 1 - x

# The mesh is not used below; it is kept so xx / yy stay defined at module
# level for any code that reads them.
xx, yy = np.meshgrid(x, y)

# Rows follow x and columns follow y, matching the [i, j] writes in the loop.
# Under meshgrid's default indexing both xx.shape[0] and yy.shape[0] equal
# len(y), so sizing the grids from them is only right when
# len(x) == len(y).
lls1 = np.zeros((len(x), len(y)))
lls2 = np.zeros((len(x), len(y)))

for i, x_ in enumerate(x):
    for j, y_ in enumerate(y):
        # lls1: weights passed as [x_, y_] (presumably matched positionally
        # to the probability arrays by vote() -- confirm).
        proba = vote([sl.proba_test, sl_ccrf.proba_test], [x_, y_])
        lls1[i, j] = log_loss(Y_test, proba)

        # lls2: the same two weights in swapped order.
        proba = vote([sl.proba_test, sl_ccrf.proba_test], [y_, x_])
        lls2[i, j] = log_loss(Y_test, proba)

fig = plt.figure()
plt.clf()
ax = fig.add_subplot(121)
ax1 = fig.add_subplot(122)

ax.set_aspect(1)
ax1.set_aspect(1)

# lls1 / lls2 are already ndarrays, so no extra np.array() copy is needed.
res = ax.imshow(lls1, cmap=plt.cm.jet, interpolation='nearest')
res = ax1.imshow(lls2, cmap=plt.cm.jet, interpolation='nearest')
Esempio n. 4
0
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp', val_path = '/kaggle/malware/test/mix_lbp', labels_file = "/kaggle/malware/trainLabels.csv")

    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}

    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()

    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()

    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based = True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)

    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()

        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()

        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")

        print "Finished training RF: ", time_now_str()

    if prediction:
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn], [2./3., 1./6., 1./3.])

        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)

    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)

        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 1].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)

        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = ['SVC with rbf kernel',
                  'Random Forest \n'
                  'n_components=7500',
                  'Decision Trees \n'
                  'n_components=7500']

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)

            plt.title(titles[i])
        plt.tight_layout()
        plt.show()
#ax.set_ylabel('Predicted')
#ax.xticks(np.unique(Y_test))
#ax.yticks(np.unique(Y_test))

#fig.show()

# Sweep blend weights for the two models: every value of x is combined with
# every value of y, and the log loss against Y_test is recorded for both
# orderings of the pair.
x = 1. / np.arange(1., 6)
y = 1 - x

# The mesh is not used below; it is kept so xx / yy stay defined at module
# level for any code that reads them.
xx, yy = np.meshgrid(x, y)

# Rows follow x and columns follow y, matching the [i, j] writes in the loop.
# Under meshgrid's default indexing both xx.shape[0] and yy.shape[0] equal
# len(y), so sizing the grids from them is only right when
# len(x) == len(y).
lls1 = np.zeros((len(x), len(y)))
lls2 = np.zeros((len(x), len(y)))

for i, x_ in enumerate(x):
    for j, y_ in enumerate(y):
        # lls1: weights passed as [x_, y_] (presumably matched positionally
        # to the probability arrays by vote() -- confirm).
        proba = vote([sl.proba_test, sl_ccrf.proba_test], [x_, y_])
        lls1[i, j] = log_loss(Y_test, proba)

        # lls2: the same two weights in swapped order.
        proba = vote([sl.proba_test, sl_ccrf.proba_test], [y_, x_])
        lls2[i, j] = log_loss(Y_test, proba)

fig = plt.figure()
plt.clf()
ax = fig.add_subplot(121)
ax1 = fig.add_subplot(122)

ax.set_aspect(1)
ax1.set_aspect(1)

# lls1 / lls2 are already ndarrays, so no extra np.array() copy is needed.
res = ax.imshow(lls1, cmap=plt.cm.jet, interpolation='nearest')
# Draw the swapped-weight grid on the right-hand subplot, which otherwise
# stays empty even though lls2 was computed for it.
res = ax1.imshow(lls2, cmap=plt.cm.jet, interpolation='nearest')
Esempio n. 6
0
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp',
                    val_path='/kaggle/malware/test/mix_lbp',
                    labels_file="/kaggle/malware/trainLabels.csv")

    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}

    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()

    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()

    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based=True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)

    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train,
                                       Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()

        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()

        print "RF score: {0:.4f}".format(
            ll_ccrf_tst if not prediction else ll_ccrf_trn)
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")

        print "Finished training RF: ", time_now_str()

    if prediction:
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn],
                     [2. / 3., 1. / 6., 1. / 3.])

        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)

    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)

        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 1].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)

        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = [
            'SVC with rbf kernel', 'Random Forest \n'
            'n_components=7500', 'Decision Trees \n'
            'n_components=7500'
        ]

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)

            plt.title(titles[i])
        plt.tight_layout()
        plt.show()