import pandas as pd
from sklearn.svm import SVC

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    # column 0 holds the label, column 1 the sample hash, columns 2+ the features
    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    print(X_train)
    print(X_test)

    # hyper-parameters inferred by running skopt
    clf = SVC(C=447.81051228628013, coef0=0.12426850569436687,
              decision_function_shape="ovr", degree=2,
              gamma=0.02413100813767344, kernel="rbf",
              tol=0.004948161298923479, verbose=True)
    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='SVM', classes=unique_labels,
        title='Confusion matrix for SVM evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='SVM', classes=unique_labels,
        title='Classification report for SVM evaluation')
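# NOTE: the skopt run that produced the SVC hyper-parameters above is not
# shown; below is a minimal sketch of how such a search could look using
# skopt's BayesSearchCV. The search-space bounds, n_iter, and cv values are
# illustrative assumptions, not the settings actually used.
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real
from sklearn.svm import SVC

df_train = pd.read_csv('../train_dataset.csv')
X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]

search_spaces = {
    'C': Real(1e-2, 1e3, prior='log-uniform'),
    'gamma': Real(1e-4, 1e0, prior='log-uniform'),
    'kernel': Categorical(['rbf', 'poly', 'sigmoid']),
    'degree': Integer(1, 5),
    'coef0': Real(0.0, 1.0),
    'tol': Real(1e-5, 1e-2, prior='log-uniform'),
}

opt = BayesSearchCV(SVC(decision_function_shape='ovr'), search_spaces,
                    n_iter=50, cv=3, n_jobs=-1, random_state=0)
opt.fit(X_train, y_train)
print(opt.best_params_)  # the kind of output hard-coded into the script above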
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='DT', classes=unique_labels,
        title='Confusion matrix for Decision Tree evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='DT', classes=unique_labels,
        title='Classification report for Decision Tree evaluation')
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    # hyper-parameters inferred by running auto-sklearn
    clf = GradientBoostingClassifier(learning_rate=0.0433556140045585,
                                     n_estimators=388,
                                     subsample=0.8291104221904706,
                                     criterion='mse', min_samples_split=13,
                                     min_samples_leaf=15, max_depth=10,
                                     max_features=0.33000096635982235,
                                     verbose=True)
    # hyper-parameters inferred by running hyperopt-sklearn
    # clf = GradientBoostingClassifier(criterion="mse",
    #                                  learning_rate=0.28539836866041823,
    #                                  max_depth=9,
    #                                  max_features=0.3842196341383438,
    #                                  min_samples_leaf=14, min_samples_split=9,
    #                                  n_estimators=734,
    #                                  subsample=0.7421091918485163)
    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='GB', classes=unique_labels,
        title='Confusion matrix for Gradient Boosting evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='GB', classes=unique_labels,
        title='Classification report for Gradient Boosting evaluation')
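# NOTE: neither optimizer run is shown in the script above; below is a minimal
# sketch of how a comparable search could be reproduced with plain hyperopt
# (TPE). The search space and max_evals are illustrative assumptions, not the
# configuration that produced the hard-coded values.
import pandas as pd
from hyperopt import fmin, hp, tpe
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

df_train = pd.read_csv('../train_dataset.csv')
X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]

space = {
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'n_estimators': hp.quniform('n_estimators', 50, 800, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 20, 1),
    'max_features': hp.uniform('max_features', 0.1, 1.0),
}

def objective(params):
    clf = GradientBoostingClassifier(
        learning_rate=params['learning_rate'],
        n_estimators=int(params['n_estimators']),
        subsample=params['subsample'],
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'])
    # hyperopt minimizes, so return the negated mean CV accuracy
    return -cross_val_score(clf, X_train, y_train, cv=3).mean()

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)
print(best)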
import pandas as pd
from sklearn.gaussian_process import GaussianProcessClassifier

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    clf = GaussianProcessClassifier(max_iter_predict=500, warm_start=True,
                                    n_jobs=-1)
    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='GP', classes=unique_labels,
        title='Confusion matrix for Gaussian Process evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='GP', classes=unique_labels,
        title='Classification report for Gaussian Process evaluation')
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    # hyper-parameters inferred from running hyperopt-sklearn
    clf = RandomForestClassifier(bootstrap=False, class_weight=None,
                                 criterion='entropy', max_depth=None,
                                 max_features='sqrt', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0,
                                 min_impurity_split=None, min_samples_leaf=1,
                                 min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=75,
                                 n_jobs=1, oob_score=False, random_state=1,
                                 verbose=False, warm_start=False)
    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='RF', classes=unique_labels,
        title='Confusion matrix for Random Forest evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='RF', classes=unique_labels,
        title='Classification report for Random Forest evaluation')
import pandas as pd
from sklearn import preprocessing
from xgboost import XGBClassifier

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values
    unique_labels = sorted(set(y_train.tolist()))

    # encode the string labels as integer class indices for XGBoost
    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)

    # hyper-parameters inferred by running hyperopt-sklearn
    clf = XGBClassifier(colsample_bylevel=0.8737745469231419,
                        colsample_bytree=1.0, gamma=4.858229599937319e-07,
                        learning_rate=0.4853267733199465, max_delta_step=0,
                        max_depth=9, min_child_weight=0, n_estimators=64,
                        reg_alpha=2.5693931492543614e-05,
                        reg_lambda=6.027978487395207e-05,
                        scale_pos_weight=73.0915750362818,
                        subsample=0.5410531887103683)
    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='XG', classes=unique_labels,
        title='Confusion matrix for XGBoost evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='XG', classes=unique_labels,
        title='Classification report for XGBoost evaluation')
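# NOTE: because of the LabelEncoder step, clf.predict returns integer class
# indices rather than the original string labels. If readable labels are
# needed, they can be recovered with inverse_transform -- a small usage sketch
# that would follow the prediction step above:
y_predicted_labels = le.inverse_transform(y_predicted)
print(y_predicted_labels[:10])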
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

import stdfunc  # project-local plotting helpers


def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    # keep the predictions of the best-scoring k in [1, 19]
    curr_pred, curr_score = None, 0
    for k in range(1, 20):
        clf = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        clf.fit(X_train, y_train)

        score = clf.score(X_test, y_test)
        y_predicted = clf.predict(X_test)
        if score > curr_score:
            print("K = {} -- {}".format(k, score))
            curr_pred = y_predicted
            curr_score = score

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, curr_pred, ml_name='Nearest_Neighbor', classes=unique_labels,
        title='Confusion matrix for Nearest Neighbor evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, curr_pred, ml_name='Nearest_Neighbor', classes=unique_labels,
        title='Classification report for Nearest Neighbor evaluation')
import os

import torch

import stdfunc  # project-local plotting helpers
# LoadDataset and MultilayerPerceptron are project-local classes
# (an assumed sketch of both follows this script)


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("\nDevice used : {}".format(
        'cuda' if torch.cuda.is_available() else 'cpu'))
    print("Pytorch version: {}".format(torch.__version__))
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))

    project_name = "MLP"

    # hyper-parameters
    num_epochs = 30        # number of full passes over the training dataset
    learning_rate = 0.003
    batch_size = 3         # samples per training batch
    layer_size = [15, 50, 100, 30, 10, 2]
    enable_checkpoint = True

    # model filename
    checkpoint_name = 'checkpoint-{}.pt'.format(project_name)

    # load dataset
    malware_train = LoadDataset(encoded_features_path='../train_dataset.csv')
    malware_test = LoadDataset(encoded_features_path='../test_dataset.csv')
    print("\nSize of training dataset: {}".format(len(malware_train)))
    print("Size of testing dataset: {}\n".format(len(malware_test)))

    # shuffle=True re-shuffles the training data at every epoch
    # pin_memory=True, ref: https://devblogs.nvidia.com/how-optimize-data-transfers-cuda-cc/
    train_loader = torch.utils.data.DataLoader(
        malware_train, batch_size=batch_size, pin_memory=True, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        malware_test, batch_size=batch_size, pin_memory=True, shuffle=False)

    # set up the model, loss, and optimizer
    mlp = MultilayerPerceptron(layer_size).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)

    epoch = 0
    # load the previous checkpoint if it exists
    if enable_checkpoint and os.path.exists(checkpoint_name):
        print("Previous checkpoint model found!\n")
        checkpoint = torch.load(checkpoint_name)
        mlp.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        mlp.eval()

    while epoch < num_epochs:
        avg_loss = 0
        for i, (X, _, labels) in enumerate(train_loader):
            mlp.train()  # switch back to train mode
            X, labels = X.to(device), labels.to(device)

            outputs = mlp(X)
            loss = criterion(outputs, labels)
            avg_loss += loss.item()

            optimizer.zero_grad()  # clear the previously computed gradients
            loss.backward()        # compute gradients for all parameters
            optimizer.step()       # update weights from the computed gradients

            if (i + 1) % 30 == 0:
                mlp.eval()  # turns off dropout and batch normalization
                epoch_fmt = str(epoch).rjust(len(str(num_epochs)))
                batch_fmt = str(i + 1).rjust(len(str(len(train_loader))))
                fmt_str = ("Epochs [" + epoch_fmt + "/{}], Batch [" +
                           batch_fmt + "/{}], Loss = {:.6f}")
                print(fmt_str.format(num_epochs, len(train_loader), loss.item()))

        avg_loss /= len(train_loader)
        if (epoch + 1) % 5 == 0:
            print("\nAverage loss for epochs [{}] = {:.8f}\n".format(
                epoch + 1, avg_loss))

        # test the accuracy of the model every 10 epochs
        if (epoch + 1) % 10 == 0:
            with torch.no_grad():
                mlp.eval()  # turns off dropout and batch normalization
                correct_cnt, total_cnt = 0, 0
                for X, mal_hash, labels in test_loader:
                    X, labels = X.float().to(device), labels.to(device)
                    outputs = mlp(X)
                    max_accuracy, pred_label = torch.max(outputs.data, 1)
                    '''
                    for each_accuracy, each_hash in zip(max_accuracy, mal_hash):
                        if each_accuracy <= 0.94:
                            print("Low accuracy {:.6f} = {}\n".format(
                                each_accuracy, each_hash))
                    '''
                    total_cnt += X.cpu().data.size()[0]
                    correct_cnt += (pred_label == labels.data).sum()
                accuracy = correct_cnt.cpu().item() * 1.0 / total_cnt
                print("Test - Epoch {} -- Accuracy : {}\n".format(
                    epoch + 1, accuracy))

        # save a checkpoint every 10 epochs so we don't lose everything
        # if training is interrupted
        if enable_checkpoint and (epoch + 1) % 10 == 0:
            print("Saving checkpoint model..\n")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': mlp.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, checkpoint_name)

        epoch += 1

    torch.save(mlp.state_dict(), '{}-Trained-Model.pt'.format(project_name))

    # final evaluation over the whole test set
    mlp.eval()
    predict_list, label_list = [], []
    with torch.no_grad():
        correct_cnt, total_cnt = 0, 0
        # the dataset yields (features, hash, label) triples, as in the loops above
        for X, _, labels in test_loader:
            X, labels = X.float().to(device), labels.to(device)
            outputs = mlp(X)
            _, pred_label = torch.max(outputs.data, 1)
            predict_list.extend(pred_label.cpu().numpy().tolist())
            label_list.extend(labels.cpu().numpy().tolist())
            total_cnt += X.cpu().data.size()[0]
            correct_cnt += (pred_label == labels.data).sum()
        accuracy = correct_cnt.cpu().item() * 1.0 / total_cnt
        print("Final Accuracy = {}\n".format(accuracy))

    with open('malware-label-index.txt', 'w') as fo:
        fo.write('[' + ','.join(malware_train.unique_labels) + ']')

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        label_list, predict_list, ml_name='MLP',
        classes=malware_train.unique_labels,
        title='Confusion matrix for MLP evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        label_list, predict_list, ml_name='MLP',
        classes=malware_train.unique_labels,
        title='Classification report for MLP evaluation')
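# NOTE: LoadDataset and MultilayerPerceptron are defined elsewhere in the
# project and are not shown above. The sketches below are minimal, assumed
# implementations consistent with how the training script uses them (CSV
# layout: label, hash, then features; layer_size = [15, 50, 100, 30, 10, 2]);
# the real classes may differ in activations, dropout, etc.
import pandas as pd
import torch


class LoadDataset(torch.utils.data.Dataset):
    """Sketch: each item is a (features, hash, label-index) triple."""

    def __init__(self, encoded_features_path):
        df = pd.read_csv(encoded_features_path)
        self.unique_labels = sorted(df.iloc[:, 0].unique().tolist())
        label_to_idx = {l: i for i, l in enumerate(self.unique_labels)}
        self.X = torch.tensor(df.iloc[:, 2:].values, dtype=torch.float32)
        self.hashes = df.iloc[:, 1].tolist()
        self.y = torch.tensor([label_to_idx[l] for l in df.iloc[:, 0]],
                              dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.hashes[idx], self.y[idx]


class MultilayerPerceptron(torch.nn.Module):
    """Sketch: fully-connected layers built from a list of widths."""

    def __init__(self, layer_size):
        super().__init__()
        layers = []
        # e.g. [15, 50, 100, 30, 10, 2] -> Linear(15, 50), ..., Linear(10, 2)
        for in_dim, out_dim in zip(layer_size[:-1], layer_size[1:]):
            layers.append(torch.nn.Linear(in_dim, out_dim))
            layers.append(torch.nn.ReLU())
        layers.pop()  # no activation after the final (logit) layer
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        # CrossEntropyLoss expects raw logits, so no softmax here
        return self.net(x.float())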