# NOTE(review): this chunk begins INSIDE an unseen plotting helper — its `def`
# line (and the code creating `labs`, `ax`, `norm_conf_mat`, `conf_mat`) is
# above the visible source. The lines down to `return conf_mat` belong to that
# helper; the file as seen here is a fragment and not runnable as-is.
x = labs - 1  # tick positions; labels appear to be 1-based — TODO confirm
plt.xticks(x, labs)
plt.yticks(x, labs)
# Annotate each cell of the (already drawn) normalized confusion matrix with
# its percentage value; note the (j, i) transpose to match plot orientation.
for i in x:
    for j in x:
        ax.text(i - 0.2, j + 0.2, "{:3.0f}".format(norm_conf_mat[j, i] * 100.))
return conf_mat  # raw (unnormalized) confusion matrix from the enclosing helper

# --- Script: train an RBF SVM, report its log loss and confusion matrix,
# --- then train a neural net on the same standard-scaled features.
# NOTE(review): X / Xt must be defined earlier in the file (not visible here).
sl = SKSupervisedLearning(SVC, X, Y_train, Xt, Y_test)
sl.fit_standard_scaler()
sl.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}
ll_trn, ll_tst = sl.fit_and_validate()
print("SVC log loss: ", ll_tst)
conf_svm = plot_confusion(sl)

#Neural net
trndata, tstdata = createDataSets(sl.X_train_scaled, Y_train, sl.X_test_scaled, Y_test)
fnn = train(trndata, tstdata, epochs=1000, test_error=0.025, momentum=0.15, weight_decay=0.0001)
# --- Script: 1dLBP features — PCA projection, LDA classifier, neural net. ---
# NOTE(review): `sklearn.lda.LDA` is the legacy import path (renamed to
# `sklearn.discriminant_analysis.LinearDiscriminantAnalysis` in sklearn 0.17+);
# left unchanged so existing environments keep working.
from sklearn.lda import LDA
from sklearn.preprocessing import normalize

# Two candidate feature sets; only the 1dlbp one (tf1) is consumed below.
tf = TrainFiles('/kaggle/malware/scratchpad/text/train/instr_freq',
                '/kaggle/malware/scratchpad/text/test/instr_freq',
                "/kaggle/malware/trainLabels.csv")
tf1 = TrainFiles('/kaggle/malware/scratchpad/train/1dlbp',
                 '/kaggle/malware/scratchpad/test/1dlbp',
                 "/kaggle/malware/trainLabels.csv")
X_train, Y_train, X_test, Y_test = tf1.prepare_inputs()

# Project onto a fixed 300 principal components. (An alternative, previously
# kept as dead code, chose n_components as the smallest k explaining >= 99%
# of variance via np.cumsum(pca.explained_variance_ratio_).)
n_components = 300
pca = PCA(n_components=n_components)
# NOTE(review): fitting PCA on train+test jointly leaks test-set statistics
# into the transform — acceptable only in a Kaggle-style transductive setup.
pca.fit(np.r_[X_train, X_test])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# LDA classifier on the raw (un-projected) features; the PCA projections
# computed above are not consumed in this chunk.
sl = SKSupervisedLearning(LDA, X_train, Y_train, X_test, Y_test)
#sl.fit_standard_scaler()

# Neural net trained on L2-normalized features.
trndata, tstdata = createDataSets(normalize(X_train), Y_train,
                                  normalize(X_test), Y_test)
train(trndata, tstdata, epochs=1000, weight_decay=0.0001, momentum=0.15)

ll = sl.fit_and_validate()
# Fixed: was a Python 2 print statement, inconsistent with the Python 3
# print() calls used elsewhere in this file (and a syntax error under py3).
print("Log loss: ", ll)
def predict():
    """Train SVM + NN (and optionally a calibrated random forest) on the
    mix_lbp features, then either write a weighted-vote submission CSV or
    visualize the classifiers' decision surfaces in 2D PCA space.

    NOTE(review): depends on names not defined in this function and not
    visible in this chunk: `prediction`, `doTrees`, and (in the else
    branch) `sl_rfc` / `sl_trees` — presumably module-level globals.
    """
    tf = TrainFiles('/kaggle/malware/train/mix_lbp', val_path = '/kaggle/malware/test/mix_lbp', labels_file = "/kaggle/malware/trainLabels.csv")
    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    # RBF SVM on standard-scaled features, with probability estimates enabled
    # so its class probabilities can be blended later.
    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}
    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()
    # On a real prediction run there are no test labels, so the train-side
    # value (`_`) is reported instead of the test loss.
    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()
    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based = True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)
    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest, probability-calibrated with 10-fold CV
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()
        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()
        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)
        # cache each model's class probabilities for offline blending
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")
        print "Finished training RF: ", time_now_str()

    if prediction:
        # Weighted soft vote of the three models' probabilities.
        # NOTE(review): `sl_ccrf` only exists when doTrees is true, and the
        # weights sum to 2/3 + 1/6 + 1/3 > 1 — confirm both are intended.
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn], [2./3., 1./6., 1./3.])
        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        # sample ids = validation file names without extension
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)
    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)
        X = pca.transform(sl_svm.X_train_scaled)
        # NOTE(review): the x range mixes X[:, 0].min() with X[:, 1].max() —
        # looks like a typo (X[:, 0].max() was probably meant).
        x = np.arange(X[:, 0].min() - 1, X[:, 1].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)
        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = ['SVC with rbf kernel', 'Random Forest \n' 'n_components=7500', 'Decision Trees \n' 'n_components=7500']

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        # NOTE(review): sl_rfc / sl_trees are not defined in this function.
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)

            # refit each classifier in the 2D projected space
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)
            plt.title(titles[i])

        plt.tight_layout()
        plt.show()
# NOTE(review): this chunk begins INSIDE an unseen confusion-matrix plotting
# helper — its `def` line (and the code creating `fig`, `res`, `ax`,
# `norm_conf_mat`, `conf_mat`) is above the visible source. The lines down to
# `return conf_mat` belong to that helper; this fragment is not runnable as-is.
cb = fig.colorbar(res)
labs = np.unique(Y_test)
x = labs - 1  # tick positions; labels appear to be 1-based — TODO confirm
plt.xticks(x, labs)
plt.yticks(x, labs)
# Annotate each cell of the normalized confusion matrix with its percentage.
for i in x:
    for j in x:
        ax.text(i - 0.2, j + 0.2, "{:3.0f}".format(norm_conf_mat[j, i] * 100.))
return conf_mat  # raw (unnormalized) confusion matrix from the enclosing helper

# --- Script: RBF SVM + neural net + calibrated random forest experiment.
# NOTE(review): X / Xt must be defined earlier in the file (not visible here);
# prints use Python 2 statement syntax, unlike the py3 calls elsewhere in
# this file.
sl = SKSupervisedLearning(SVC, X, Y_train, Xt, Y_test)
sl.fit_standard_scaler()
sl.train_params = {'C': 100, 'gamma': 0.01, 'probability' : True}
ll_trn, ll_tst = sl.fit_and_validate()
print "SVC log loss: ", ll_tst
conf_svm = plot_confusion(sl)

#Neural net
trndata, tstdata = createDataSets(sl.X_train_scaled, Y_train, sl.X_test_scaled, Y_test)
fnn = train(trndata, tstdata, epochs = 1000, test_error = 0.025, momentum = 0.15, weight_decay = 0.0001)

# Random forest with 10-fold-CV probability calibration.
sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X, Y_train, Xt, Y_test)
sl_ccrf.train_params = \
    {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
sl_ccrf.fit_standard_scaler()
ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()
def predict():
    """Train SVM + NN (and optionally a calibrated random forest) on the
    mix_lbp features, then either write a weighted-vote submission CSV or
    visualize the classifiers' decision surfaces in 2D PCA space.

    NOTE(review): this is a near-duplicate of the predict() defined earlier
    in this file; a later definition shadows the earlier one at import time.
    It also depends on names not defined here: `prediction`, `doTrees`, and
    (in the else branch) `sl_rfc` / `sl_trees` — presumably globals.
    """
    tf = TrainFiles('/kaggle/malware/train/mix_lbp', val_path='/kaggle/malware/test/mix_lbp', labels_file="/kaggle/malware/trainLabels.csv")
    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    # RBF SVM on standard-scaled features, with probability estimates enabled
    # so its class probabilities can be blended later.
    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}
    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()
    # On a real prediction run there are no test labels, so the train-side
    # value (`_`) is reported instead of the test loss.
    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()
    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based=True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)
    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest, probability-calibrated with 10-fold CV
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()
        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()
        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)
        # cache each model's class probabilities for offline blending
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")
        print "Finished training RF: ", time_now_str()

    if prediction:
        # Weighted soft vote of the three models' probabilities.
        # NOTE(review): `sl_ccrf` only exists when doTrees is true, and the
        # weights sum to 2/3 + 1/6 + 1/3 > 1 — confirm both are intended.
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn],
                     [2. / 3., 1. / 6., 1. / 3.])
        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        # sample ids = validation file names without extension
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)
    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)
        X = pca.transform(sl_svm.X_train_scaled)
        # NOTE(review): the x range mixes X[:, 0].min() with X[:, 1].max() —
        # looks like a typo (X[:, 0].max() was probably meant).
        x = np.arange(X[:, 0].min() - 1, X[:, 1].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)
        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = [
            'SVC with rbf kernel',
            'Random Forest \n' 'n_components=7500',
            'Decision Trees \n' 'n_components=7500'
        ]

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        # NOTE(review): sl_rfc / sl_trees are not defined in this function.
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)

            # refit each classifier in the 2D projected space
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)
            plt.title(titles[i])

        plt.tight_layout()
        plt.show()
# NOTE(review): this chunk begins MID-CALL — the first two strings are the
# trailing arguments of a `tf = TrainFiles(...)` constructor whose opening is
# above the visible source; this fragment is not runnable as-is. The rest is
# a near-duplicate of the PCA/LDA/neural-net script earlier in this file.
            '/kaggle/malware/scratchpad/text/test/instr_freq',
            "/kaggle/malware/trainLabels.csv")
# 1dLBP feature set — the one actually consumed below.
tf1 = TrainFiles('/kaggle/malware/scratchpad/train/1dlbp',
                 '/kaggle/malware/scratchpad/test/1dlbp',
                 "/kaggle/malware/trainLabels.csv")
X_train, Y_train, X_test, Y_test = tf1.prepare_inputs()

# Project onto a fixed 300 principal components.
# NOTE(review): fitting PCA on train+test jointly leaks test-set statistics
# into the transform — acceptable only in a Kaggle-style transductive setup.
n_components = 300
pca = PCA(n_components=n_components)
pca.fit(np.r_[X_train, X_test])
#n_components = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.99)[0][0]
#print n_components
#pca = PCA(n_components = n_components)
#pca.fit(np.r_[X_train, X_test])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Naive Bayes
# NOTE(review): the comment above appears stale — the model used is LDA, and
# the PCA projections computed above are not consumed in this chunk.
sl = SKSupervisedLearning(LDA, X_train, Y_train, X_test, Y_test)
#sl.fit_standard_scaler()

# Neural net trained on L2-normalized features.
trndata, tstdata = createDataSets(normalize(X_train), Y_train, normalize(X_test), Y_test)
train(trndata, tstdata, epochs=1000, weight_decay=0.0001, momentum=0.15)

ll = sl.fit_and_validate()
print "Log loss: ", ll