def pickle_syscalls():
    """Extract syscall features for the train and test sets and pickle them.

    Writes (train matrix, feature dict, categories) to 'matrix_train' and
    (test matrix, ids) to 'matrix_test' in the current directory.
    """
    mat, key, cats, _ = classify.extract_feats([syscalls], 'train')
    mat = np.asarray(mat.todense())
    # `with` guarantees the handle is flushed and closed; the original
    # opened both files and never closed them.
    with open('matrix_train', 'wb') as matrix_train:
        pickle.dump((mat, key, cats), matrix_train)
    # Reuse the training feature dict so test columns line up with train.
    test_mat, _, _, ids = classify.extract_feats([syscalls], direc='test',
                                                 global_feat_dict=key)
    test_mat = np.asarray(test_mat.todense())
    with open('matrix_test', 'wb') as matrix_test:
        pickle.dump((test_mat, ids), matrix_test)
def example_structure_plot():
    """Scatter-plot the num_processes feature against malware category.

    Each training sample is plotted individually; samples in category 8
    are drawn black, all others red.
    """
    mat, key, cats, _ = classify.extract_feats([structure], 'train')
    col = key['num_processes']
    for row in range(mat.shape[0]):
        plt.scatter([mat[row, col]],
                    [cats[row]],
                    c='black' if cats[row] == 8 else 'red')
    plt.show()
def get_stats(key_name):
    """Per-category statistics of one feature over the training set.

    key_name : name of the feature column (a key of the feature dict).

    Returns (means, var, std): numpy arrays of per-category mean and
    variance, and a list of per-category standard deviations, each of
    length NUM_MALEWARE.  Categories with no samples divide by zero,
    as in the original.
    """
    mat, key, cats, ids = classify.extract_feats([structure], 'train')
    n_rows = mat.shape[0]
    col = key[key_name]
    sums = np.zeros(NUM_MALEWARE)
    counts = np.zeros(NUM_MALEWARE)
    for i in range(n_rows):
        sums[cats[i]] += mat[i, col]
        counts[cats[i]] += 1
    # Vectorized element-wise division replaces the per-index loop.
    means = sums / counts
    var = np.zeros(NUM_MALEWARE)
    for i in range(n_rows):
        var[cats[i]] += (mat[i, col] - means[cats[i]]) ** 2
    var = var / counts
    # The original allocated np.zeros for std and immediately overwrote it
    # with map(); the dead allocation is removed.  map() is kept so the
    # returned std stays a list of floats (Python 2 semantics).
    std = map(math.sqrt, var)
    return (means, var, std)
if __name__ == "__main__": if len(sys.argv) < 2: print "USAGE: create_sqlite_db.py [db_name] [num_data_points]" quit() if len(sys.argv) == 3: classification_starter.TOTAL_NUM_DATA = int(sys.argv[2]) print "Using at most", classification_starter.TOTAL_NUM_DATA, "data points..." dbname = sys.argv[1] if dbname[-3:] != ".db": dbname = dbname + ".db" print "creating database..." conn = sqlite3.connect(dbname) c = conn.cursor() # extract features ffs = features.ALL_FEATURES print "extracting training features..." train_dir = "train" X_train, global_feat_dict, t_train, train_ids = classification_starter.extract_feats( ffs, train_dir) print "creating table..." create_table(conn, c, global_feat_dict) print "writing data..." write_data(conn, c, X_train, global_feat_dict, t_train, train_ids) print "creating index..." create_index(conn, c) print "done!"
def loadData(params, withhold, ffs, trainDir="train", testDir="test"):
    """ loads the movie data
    arguments:
      params : dict with several keys:
        load : loading mode; either: 'extract' to load from
          `params['extractFile']`, 'split' to load from
          `params['splitFile']`, or None to extract features and save to
          `params['extractFile']` and/or `params['splitFile']`.
        extractFile : file to load/save extracted features to/from,
          depending on loading mode
        splitFile : file to load/save split data to/from, depending on
          loading mode
        loadTest : True to load the test data (when withold=0), False to
          return empty arrays
      withhold : number of data points to withhold for cross-validation;
        if 0 and `params['loadTest']` is True, then the test data will be
        loaded and returned
      ffs : list of feature functions
      trainDir : path to training file (train.xml)
      testDir : path to test cases file
    returns:
      X_train ,y_train,train_ids, X_test,y_test,test_ids
    """
    # NOTE(review): this source arrived with all lines collapsed; the
    # indentation below is a reconstruction — confirm nesting (especially
    # the splitFile save near the end) against version history.
    # load data from `params['splitFile']`
    if params['load'] == 'split':
        # Fast path: everything (train + withheld test split) was pickled
        # previously by the branch at the bottom of this function.
        X_train, y_train, train_ids, X_test, y_test, test_ids = unpickle(
            params['splitFile'])
        print "loaded %d training fds" % len(train_ids)
        print "withheld %d fds for testing" % len(test_ids)
    else:
        # load data from scratch
        if params['load'] == None:
            fds, targets, train_ids = classif.extract_feats_helper(
                ffs, trainDir)
            print "loaded %d fds" % len(fds)
            if params['extractFile'] != None:
                # `pickle` here is a project helper (it is called with two
                # positional args), not the stdlib module.
                pickle((fds, targets, train_ids), params['extractFile'])
        # load data from `params['extractFile']`, but split it anew
        elif params['load'] == 'extract':
            fds, targets, train_ids = unpickle(params['extractFile'])
            print "loaded %d fds" % len(fds)
        # load the test data from the testcases file
        if withhold == 0:
            print "Extracting test data features"
            X_train, feat_dict = classif.make_design_mat(fds)
            y_train = np.array(targets)
            # NOTE(review): train_ids is deliberately cleared here even
            # though it was loaded above — confirm callers expect this.
            train_ids = []
            if params['loadTest']:
                X_test, _, y_test, test_ids = classif.extract_feats(
                    ffs, testDir, global_feat_dict=feat_dict)
            else:
                # loadTest is False: return empty placeholders for test data.
                X_test, y_test, test_ids = np.empty(shape=(0, 0)), np.empty(
                    shape=(0, 0)), []
            print "Done"
        # withhold some of the training data into test data
        else:
            fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(
                fds, targets, train_ids, withhold, params['splitMethod'])
            X_train, feat_dict = classif.make_design_mat(fds)
            # Test design matrix reuses the training feature dict so the
            # columns line up.
            X_test, _ = classif.make_design_mat(fdsTest, feat_dict)
            y_train = np.array(targets)
            y_test = np.array(targetsTest)
        if params['splitFile'] != None:
            # Cache the split so a later run can use load == 'split'.
            print "Writing split file..."
            pickle((X_train, y_train, train_ids, X_test, y_test, test_ids),
                   params['splitFile'])
            print "Done writing split file"
    return X_train, y_train, train_ids, X_test, y_test, test_ids
def loadData(params, withhold, ffs, trainDir="train", testDir="test"): """ loads the movie data arguments: params : dict with several keys: load : loading mode; either: 'extract' to load from `params['extractFile']`, 'split' to load from `params['splitFile']`, or None to extract features and save to `params['extractFile']` and/or `params['splitFile']`. extractFile : file to load/save extracted features to/from, depending on loading mode splitFile : file to load/save split data to/from, depending on loading mode loadTest : True to load the test data (when withold=0), False to return empty arrays withhold : number of data points to withhold for cross-validation; if 0 and `params['loadTest']` is True, then the test data will be loaded and returned ffs : list of feature functions trainDir : path to training file (train.xml) testDir : path to test cases file returns: X_train ,y_train,train_ids, X_test,y_test,test_ids """ # load data from `params['splitFile']` if params['load']=='split': X_train, y_train, train_ids,X_test,y_test,test_ids = unpickle(params['splitFile']) print "loaded %d training fds" % len(train_ids) print "withheld %d fds for testing" % len(test_ids) else: # load data from scratch if params['load']==None: fds, targets, train_ids = classif.extract_feats_helper(ffs, trainDir) print "loaded %d fds" % len(fds) if params['extractFile'] != None: pickle((fds,targets,train_ids),params['extractFile']) # load data from `params['extractFile']`, but split it anew elif params['load']=='extract': fds,targets,train_ids=unpickle(params['extractFile']) print "loaded %d fds" % len(fds) # load the test data from the testcases file if withhold==0: print "Extracting test data features" X_train,feat_dict = classif.make_design_mat(fds) y_train=np.array(targets) train_ids = [] if params['loadTest']: X_test,_,y_test,test_ids = classif.extract_feats(ffs, testDir, global_feat_dict=feat_dict) else: X_test, y_test, test_ids = np.empty(shape=(0,0)), np.empty(shape=(0,0)), [] print 
"Done" # withhold some of the training data into test data else: fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(fds, targets, train_ids, withhold, params['splitMethod']) X_train,feat_dict = classif.make_design_mat(fds) X_test,_ = classif.make_design_mat(fdsTest, feat_dict) y_train=np.array(targets) y_test=np.array(targetsTest) if params['splitFile'] != None: print "Writing split file..." pickle((X_train, y_train, train_ids,X_test,y_test,test_ids), params['splitFile']) print"Done writing split file" return X_train,y_train,train_ids, X_test,y_test,test_ids
# NOTE(review): this span duplicates the body of the `__main__` block earlier
# in this chunk but is missing its enclosing `if __name__ == "__main__":` /
# `if len(sys.argv) < 2:` headers — the leading print/quit() dangle without a
# guard.  It looks like a paste/extraction artifact; left byte-identical
# rather than guessing at the lost structure.  Confirm against the original
# file and delete or re-guard as appropriate.
print "USAGE: create_sqlite_db.py [db_name] [num_data_points]" quit() if len(sys.argv) == 3: classification_starter.TOTAL_NUM_DATA = int(sys.argv[2]) print "Using at most",classification_starter.TOTAL_NUM_DATA,"data points..." dbname = sys.argv[1] if dbname[-3:] != ".db": dbname = dbname + ".db" print "creating database..." conn = sqlite3.connect(dbname) c = conn.cursor() # extract features ffs = features.ALL_FEATURES print "extracting training features..." train_dir = "train" X_train,global_feat_dict,t_train,train_ids = classification_starter.extract_feats(ffs, train_dir) print "creating table..." create_table(conn, c, global_feat_dict) print "writing data..." write_data(conn, c, X_train, global_feat_dict, t_train, train_ids) print "creating index..." create_index(conn, c) print "done!"