def load_ember_dataset():
    """
    Return train and test data from EMBER.

    :return: (array, array, array, array)
    """

    # Perform feature vectorization only if necessary.
    try:
        x_train, y_train, x_test, y_test = ember.read_vectorized_features(
            constants.EMBER_DATA_DIR,
            feature_version=1
        )
    except Exception:
        ember.create_vectorized_features(
            constants.EMBER_DATA_DIR,
            feature_version=1
        )
        x_train, y_train, x_test, y_test = ember.read_vectorized_features(
            constants.EMBER_DATA_DIR,
            feature_version=1
        )

    x_train = x_train.astype(dtype='float64')
    x_test = x_test.astype(dtype='float64')

    # Get rid of unknown labels (-1)
    x_train = x_train[y_train != -1]
    y_train = y_train[y_train != -1]
    x_test = x_test[y_test != -1]
    y_test = y_test[y_test != -1]

    return x_train, y_train, x_test, y_test
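A quick usage sketch for the helper above (purely illustrative; it assumes `ember` and the project-specific `constants` module are importable, as in the original source):

x_train, y_train, x_test, y_test = load_ember_dataset()  # hypothetical call site
print("train:", x_train.shape, "test:", x_test.shape)    # confirm shapes after label filtering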
def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("datadir", metavar="DATADIR", type=str,
                        help="Directory with raw features")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(args.datadir))

    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir)

    X_train_path = os.path.join(args.datadir, "X_train_vboat.dat")
    y_train_path = os.path.join(args.datadir, "y_train_vboat.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features for vboat")
        print(ember.__file__)
        ember.create_vectorized_features_vboat(args.datadir)

    print("Training LightGBM model")
    lgbm_model = ember.train_model_vboat(args.datadir, 50)
    lgbm_model.save_model(os.path.join(args.datadir, "model_vboat.txt"))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--datadir", help="Features directory", type=str)
    parser.add_argument("-o", "--output", help="Output directory", type=str)
    args = parser.parse_args()

    if not os.path.exists(args.datadir):
        parser.error("{} is not a directory".format(args.datadir))
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Count the total number of records in features.jsonl
    rows = 0
    with jsonlines.open(os.path.join(args.datadir, 'features.jsonl')) as reader:
        for obj in reader.iter(type=dict, skip_invalid=True):
            rows += 1

    clear(args.datadir)
    ember.create_vectorized_features(args.datadir, rows)

    # Train and save model
    print("Training LightGBM model")
    lgbm_model = ember.train_model(args.datadir, rows)
    lgbm_model.save_model(os.path.join(args.output, "model.txt"))
def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-v", "--featureversion", type=int, default=2,
                        help="EMBER feature version")
    parser.add_argument("-m", "--metadata", action="store_true",
                        help="Create metadata CSVs")
    parser.add_argument("-t", "--train", action="store_true",
                        help="Train an EMBER model")
    parser.add_argument("datadir", metavar="DATADIR", type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize", action="store_true",
                        help="gridsearch to find best parameters")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(args.datadir))

    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    if args.metadata:
        ember.create_metadata(args.datadir)

    if args.train:
        params = {
            "boosting": "gbdt",
            "objective": "binary",
            "num_iterations": 1000,
            "learning_rate": 0.05,
            "num_leaves": 2048,
            "max_depth": 15,
            "min_data_in_leaf": 50,
            "feature_fraction": 0.5
        }
        if args.optimize:
            params = ember.optimize_model(args.datadir)
            print("Best parameters: ")
            print(json.dumps(params, indent=2))

        print("Training LightGBM model")
        lgbm_model = ember.train_model(args.datadir, params, args.featureversion)
        lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))
def vectorize(self):
    # TODO: error-check that the input file is a valid jsonl file
    if self.rows == 0:
        # logger.info('[Error] Please check if jsonl file is empty ...')
        return -1
    ember.create_vectorized_features(self.jsonlpath, self.output, self.rows,
                                     self.features, self.dim)
def createVectorizedFeatures(self):
    if not os.path.exists(self.outDir) or not os.path.isdir(self.outDir):
        print("{} is not a directory with raw feature files".format(self.outDir))

    X_train_path = os.path.join(self.outDir, "X_train.dat")
    y_train_path = os.path.join(self.outDir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(self.outDir)
    else:
        print("Vectorized features (.dat files) are already created")
def init_vectorized_features(dataset_dir: str):
    """
    Required for the generation of '.dat' data files.

    :param dataset_dir: path to the base directory of the dataset
    :return:
    """
    try:
        assert os.path.exists(dataset_dir)
        ember.create_vectorized_features(dataset_dir, 1)
    except AssertionError:
        raise Exception(
            "[ASSERTION ERROR] The provided path to the base directory of the dataset does not exist"
        )
def main():
    datadir = '/home/mira/research/dataset/ember.2'
    if not os.path.exists(datadir) or not os.path.isdir(datadir):
        print("not a path")

    X_train_path = os.path.join(datadir, "X_train.dat")
    y_train_path = os.path.join(datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("[{}] Creating vectorized features".format(datetime.datetime.now()))
        ember.create_vectorized_features(datadir)

    print("[{}] Training LightGBM model".format(datetime.datetime.now()))
    lgbm_model = ember.train_model(datadir)
    lgbm_model.save_model(os.path.join(datadir, "model.txt"))
    print("[{}] Done".format(datetime.datetime.now()))
import os

import ember

# from tensorflow import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Input, Dense, Dropout
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.keras.utils import to_categorical

datadir = './data/ember2017_1/'

# In[ ]:

# create vectorized features
X_train_path = os.path.join(datadir, "X_train.dat")
y_train_path = os.path.join(datadir, "y_train.dat")
if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
    print("[*] Creating vectorized features")
    ember.create_vectorized_features(datadir, 1)

# In[ ]:

print("[*] training: read vectorized features")
x_train, y_train = ember.read_vectorized_features(datadir, "train", 1)

# In[ ]:

print("[*] testing: read vectorized features")
x_test, y_test = ember.read_vectorized_features(datadir, "test", 1)

# In[ ]:

train_rows = y_train != -1
print(train_rows.size)
import ember
import os
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import numpy as np

data_dir = "/home/cuckoo/Desktop/ember/ember2018/"
feature_version = 2

if not (os.path.exists(os.path.join(data_dir, f"X_train_{feature_version}.dat"))
        and os.path.exists(os.path.join(data_dir, f"y_train_{feature_version}.dat"))):
    print("Creating vectorized features")
    ember.create_vectorized_features(data_dir, feature_version=feature_version)

# _ = ember.create_metadata(data_dir)
# emberdf = ember.read_metadata(data_dir)

X_test, y_test = ember.read_vectorized_features(data_dir, subset="test",
                                                feature_version=feature_version)
# X_train, y_train = ember.read_vectorized_features(data_dir, subset="train", feature_version=3)

with open(os.path.join(data_dir, f"SGDR_model_{feature_version}.pkl"), 'rb') as f:
    model = pickle.load(f)

y_test_pred = model.predict(X_test)
print("ROC AUC:", roc_auc_score(y_test, y_test_pred))
y_train.dat and y_test.dat
"""
import argparse
from sys import argv

import ember


def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', dest='data_dir', type=str, default='data',
                        help='Path to data directory.')
    parser.add_argument('--scale', dest='scale', type=float, default=1.,
                        help='Scale of training/test dataset.')
    return parser.parse_args(argv)


# Parse arguments
args = parse_arguments(argv[1:])
data_dir = args.data_dir

ember.create_vectorized_features(data_dir, scale=args.scale)
def main():
    debug = True

    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("datadir", metavar="DATADIR", type=str,
                        help="Directory with raw features")
    args = parser.parse_args()

    # If model data doesn't exist yet, create it from raw features
    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(args.datadir))
    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir)

    # Get training and testing data
    X_train, y_train, X_test, y_test = ember.read_vectorized_features(args.datadir)
    if debug:
        print("X_train shape: ", X_train.shape)
        print("y_train shape: ", y_train.shape)
        print("X_test shape: ", X_test.shape)
        print("y_test shape: ", y_test.shape)

    # Convert memmap to pandas Series for metrics
    y_test = pandas.Series(data=y_test)

    # Decision tree learner
    from sklearn import tree
    tree_clf = tree.DecisionTreeClassifier(max_depth=5)

    """
    tree_model_path = os.path.join(args.datadir, "tree_model.p")
    # Train model if it doesn't exist
    if not os.path.exists(tree_model_path):
        print("Training model")
        tree_clf.fit(X_train, y_train)
        pickle.dump(tree_clf, open("tree_model.p", "wb"))
    saved_tree_clf = pickle.load(open("tree_model.p", "rb"))
    """

    tree_clf.fit(X_train, y_train)
    tree_dot = tree.export_graphviz(tree_clf, out_file=None)
    graph = graphviz.Source(tree_dot)
    graph.render("tree")

    y_pred = tree_clf.predict(X_test)

    print("\n##### Metrics #####\n")
    print("Accuracy Score")
    print(metrics.accuracy_score(y_test, y_pred), "\n")
    print("Class distribution\n", y_test.value_counts(), "\n")
    print("Average Malware: ", y_test.mean())
    print("Average Benign: ", 1 - y_test.mean())
    print("Null Accuracy: ", max(y_test.mean(), 1 - y_test.mean()), "\n")

    print("Confusion Matrix")
    print("[[TN FP]\n [FN TP]]\n")
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion, "\n")
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]

    print("Accuracy: how often is the classifier right?")
    print("Accuracy from Confusion Matrix")
    print("(TP + TN) / float(TP + TN + FP + FN)")
    print((TP + TN) / float(TP + TN + FP + FN), "\n")

    print("Classification Error: how often is the classifier wrong?")
    print("1 - metrics.accuracy_score(y_test, y_pred)")
    print(1 - metrics.accuracy_score(y_test, y_pred), "\n")

    print("Sensitivity: when the actual value is positive,\n how often is the prediction right?")
    print("Also called 'recall'")
    print("TP / float(TP + FN)")
    print(TP / float(TP + FN), "\n")
    print(metrics.recall_score(y_test, y_pred))

    print("Specificity: when the actual value is negative, how often is the prediction right?")
    print("TN / float(TN + FP)")
    print(TN / float(TN + FP), "\n")

    print("False Pos. Rate: when the actual value is negative,\n how often is the prediction wrong?")
    print("FP / float(TN + FP)")
    print(FP / float(TN + FP), "\n")

    print("Precision: when a positive value is predicted,\n how often is the prediction right?")
    print("TP / float(TP + FP)")
    print(TP / float(TP + FP), "\n")

    # Classification threshold
    # MUST USE y_pred_prob with the positive class!
    y_pred_prob = tree_clf.predict_proba(X_test)[:, 1]

    # ROC: choose a threshold that balances sensitivity and specificity
    # The ideal plot hugs the top left of the graph: high sensitivity and high specificity
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.title('ROC curve for malware classifier')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.grid(True)
    plt.show()

    evaluate_threshold(tpr, fpr, thresholds, 0.5)

    # AUC: percentage of the ROC plot that is under the curve
    # A higher AUC indicates an ROC curve closer to the top left
    # Useful for imbalanced classes
    print("AUC")
    print(metrics.roc_auc_score(y_test, y_pred_prob), "\n")
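The example calls evaluate_threshold, which is defined elsewhere in its source file. A minimal sketch of such a helper, assuming it simply reports sensitivity and specificity at a chosen probability threshold (this implementation is a guess, not the original):

def evaluate_threshold(tpr, fpr, thresholds, threshold):
    # Hypothetical helper: thresholds from metrics.roc_curve are in decreasing order,
    # so take the first operating point whose threshold has dropped to the requested value.
    idx = next((i for i, t in enumerate(thresholds) if t <= threshold), len(thresholds) - 1)
    print('Sensitivity:', tpr[idx])
    print('Specificity:', 1 - fpr[idx])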
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 28 12:42:09 2019

@author: Piyush
"""
import ember

ember.create_vectorized_features('D:/ml_projects/datasets/ember')
ember.create_metadata('D:/ml_projects/datasets/ember')
import ember
import h5py

ember_dir = "../data/ember2018/"
ember.create_vectorized_features(ember_dir)
X_train, y_train, X_test, y_test = ember.read_vectorized_features(ember_dir)

with h5py.File("../data/Ember2018.h5", 'w') as f:
    grp_train = f.create_group("train")
    grp_train.create_dataset("data", data=X_train.transpose())
    grp_train.create_dataset("targets", data=y_train)
    grp_test = f.create_group("test")
    grp_test.create_dataset("data", data=X_test.transpose())
    grp_test.create_dataset("targets", data=y_test)
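To sanity-check the layout written above, the HDF5 file can be read back with h5py; a short sketch reusing the same path (note the datasets were stored transposed):

import h5py

with h5py.File("../data/Ember2018.h5", 'r') as f:
    X_train = f["train/data"][:].T   # transpose back to (samples, features)
    y_train = f["train/targets"][:]
print(X_train.shape, y_train.shape)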
def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("--modelname", type=str, default="SGD", help="Model name")
    parser.add_argument("-v", "--featureversion", type=int, default=2,
                        help="EMBER feature version")
    parser.add_argument("datadir", metavar="DATADIR", type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize", help="gridsearch to find best parameters",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(args.datadir))

    X_train_path = os.path.join(args.datadir, f"X_train_{args.featureversion}.dat")
    y_train_path = os.path.join(args.datadir, f"y_train_{args.featureversion}.dat")
    # If the vectorized features don't exist, compute them.
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    # feature_name = ['feature_' + str(col) for col in range(num_feature)]
    params = {
        "boosting": "gbdt",
        "objective": "regression",
        "num_iterations": 1000,
        "learning_rate": 0.05,
        "num_leaves": 2048,
        "max_depth": 15,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.5,
        "num_threads": 2,
    }
    if args.optimize:
        params = ember.optimize_model(args.datadir)
        print("Best parameters: ")
        print(json.dumps(params, indent=2))

    print("Training Classifier model")
    lgbm_model = ember.train_model(args.datadir, params, args.featureversion)

    # Save to file in the current working directory
    # pkl_filename = os.path.join(args.datadir, f"{args.modelname}_model_{args.featureversion}.pkl")
    # with open(pkl_filename, 'wb') as f:
    #     pickle.dump(lgbm_model, f)
    print("file dumped into model.txt ....")
    lgbm_model.save_model(os.path.join(args.datadir, f"model_{args.featureversion}.txt"))

    print('Plotting feature importances...')
    ax = lgb.plot_importance(lgbm_model, max_num_features=10)
    plt.savefig(f'lgbm_importances-0{args.featureversion}.png')
    # run
    os.system(f"xdg-open lgbm_importances-0{args.featureversion}.png")
import ember

# ember hard codes the size of the dataset, so it's best to just let it convert
# all 1.1M points. This only takes about 30 minutes and only has to be done once.
# The vectorized data points are stored in the data directory and are mmapped
# into memory when training.
ember.create_vectorized_features("./data/ember/")
ember.create_metadata("./data/ember/")
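Once the .dat files exist, later runs can skip vectorization and just read the memory-mapped features back; a small sketch reusing the same directory and the label filtering seen in earlier examples:

import ember

# Returns memory-mapped arrays; nothing is re-vectorized if the .dat files already exist.
X_train, y_train, X_test, y_test = ember.read_vectorized_features("./data/ember/")

# EMBER marks unlabeled samples with -1; drop them before supervised training.
X_train, y_train = X_train[y_train != -1], y_train[y_train != -1]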
def create_vectorized_ember():
    ember.create_vectorized_features("./data/ember2018/")
    ember.create_metadata("./data/ember2018/")