def run_open_experiment(self, iterations=10):
    """
    Train and evaluate a linear SVM on open-world splits of the dataset.

    For every iteration the dataset is re-split with
    randomize_dataset_open_world(), a LinearSVC is fitted on the training
    split (C chosen by grid-search cross-validation over 10^-3 .. 10^3) and
    every test sample is labelled from its decision scores. Predictions and
    true labels of all iterations are accumulated in self.predictions and
    self.true_labels and finally saved as "mca_predictions_open.pz" and
    "mca_true_labels_open.pz".

    Args:
        iterations: number of random splits to train and evaluate.
    """
    # Label emitted when a sample is rejected by the score-ratio test below.
    # NOTE(review): presumably the "unknown" open-world class -- confirm.
    rejected_label = 99

    self.true_labels = np.array([])
    self.predictions = np.array([])
    for _ in xrange(iterations):
        self.randomize_dataset_open_world()
        clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
        clf.fit(self.X_train, self.Y_train)
        best = clf.best_estimator_
        out = best.decision_function(self.X_test)
        classes = best.classes_

        # Collect this split's predictions in a list and append them once,
        # instead of re-allocating self.predictions for every sample.
        split_predictions = []
        for scores in out:
            m = np.max(scores)
            # Reject the sample when the ratio between the top score and
            # any per-class score has magnitude below 0.5.
            if (np.abs(m / scores) < 0.5).any():
                split_predictions.append(rejected_label)
            else:
                # argmax always yields exactly one label, even when several
                # classes tie for the top score, so predictions stay aligned
                # one-to-one with the true labels.
                split_predictions.append(classes[np.argmax(scores)])

        self.predictions = np.append(self.predictions, split_predictions)
        self.true_labels = np.append(self.true_labels, self.Y_test)

    pz.save(self.predictions, "mca_predictions_open.pz")
    pz.save(self.true_labels, "mca_true_labels_open.pz")
def _start_progress_bar(title, maxval):
    """ Create and start a console progress bar labelled with `title`. """
    widgets = [title, Percentage(), " ",
               Bar(marker="#", left="[", right="]"),
               " ", ETA(), " "]
    pbar = ProgressBar(widgets=widgets, maxval=maxval)
    pbar.start()
    return pbar


def nh_kernel_matrix(graph_set, R=1):
    """ Compute the kernel matrix of a set of graphs using the NHK
    and label comparison.

    Every iteration applies neighborhood_hash() to each graph, builds a
    label histogram per graph and fills a symmetric N x N matrix with the
    pairwise histogram_intersection() values (diagonal set to 1). Each
    per-iteration matrix is saved as "K_<r>.pz" and the element-wise mean
    of all R matrices is returned.

    Args:
        graph_set: list of graphs. NOTE: it is modified in place -- every
            graph is replaced by its hashed version on each iteration.
        R: number of hashing iterations; must be >= 1 (R=0 leaves nothing
            to average and fails with ZeroDivisionError).

    Returns:
        The average of the R kernel matrices.
    """
    N = len(graph_set)
    K_set = []
    # One comparison per unordered pair of distinct graphs.
    pairs_per_iteration = N * (N - 1) // 2
    computation_size = R * pairs_per_iteration
    print("Total number of graphs: {0}".format(N))
    print("Total number of graph comparisons: {0}".format(computation_size))

    for r in xrange(R):
        # compute neighbor hash for nodes in every graph
        print("Starting iteration {0}...".format(r))
        pbar = _start_progress_bar("Computing NH: ", N)
        for i in xrange(N):
            graph_set[i] = neighborhood_hash(graph_set[i])
            pbar.update(i + 1)
        pbar.finish()

        # precompute the label histogram for each graph
        pbar = _start_progress_bar("Computing Label Hist: ", N)
        graph_set_hist = []
        for done, g in enumerate(graph_set, 1):
            graph_set_hist.append(label_histogram(g))
            pbar.update(done)
        pbar.finish()

        # compute upper triangular kernel matrix; the bar is sized for the
        # comparisons of THIS iteration because its counter restarts at
        # zero on every iteration
        pbar = _start_progress_bar("Computing K: ", pairs_per_iteration)
        progress = 0
        K = np.identity(N)
        for i in xrange(N):
            for j in xrange(i + 1, N):
                K[i, j] = histogram_intersection(graph_set_hist[i],
                                                 graph_set_hist[j])
                progress += 1
                pbar.update(progress)
        pbar.finish()

        # build lower triangle; subtracting the identity undoes the
        # doubling of the diagonal caused by adding the transpose
        K = K + K.transpose() - np.identity(len(K))
        pz.save(K, "K_{0}.pz".format(r))
        K_set.append(K)

    # normalization of K: average over all iterations
    return sum(K_set) / len(K_set)
def process_apk_dir(dataset_dir):
    """ Convert a series of APK into FCGNX objects

    Walk the whole subtree below dataset_dir, build a function call graph
    for every file found and pickle it next to the source file as
    "<sha256>.fcg.pz". A file whose graph cannot be built is copied to
    "<dataset_dir>/<ExceptionClassName>/" so failures can be inspected
    per error type.

    Note: the interpreter recursion limit is raised to 100000 for the
    whole process as a side effect.

    Args:
        dataset_dir: a directory containing a list of APK files.
    """
    # Local import so the block does not depend on the file's import list.
    import shutil

    sys.setrecursionlimit(100000)

    # collect every file in the subtree (no filtering by extension and no
    # check for already existing .fcg.pz files is performed)
    files = []
    for dir_name, _subdirs, file_names in os.walk(dataset_dir):
        files.extend(os.path.join(dir_name, name) for name in file_names)

    # set up progress bar
    print("\nProcessing {0} APK files in dir {1}".format(
        len(files), dataset_dir))
    widgets = [
        'Building CGs: ', Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ', ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=len(files))
    pbar.start()

    # loop through .apk files and save them in .fcg format
    for progress, f in enumerate(files, 1):
        print("[] Loading {0}".format(f))
        try:
            g = build_fcg_nx(f)
        # if an exception happens, save the .apk in the corresponding dir
        except Exception as e:
            err = e.__class__.__name__
            d = os.path.join(dataset_dir, err)
            if not os.path.exists(d):
                os.makedirs(d)
            # shutil.copy instead of a shell "cp": paths with spaces or
            # shell metacharacters are copied correctly and never executed.
            try:
                shutil.copy(f, d)
            except EnvironmentError as copy_err:
                # Triage copy is best effort; keep processing other files.
                print("[!] Could not copy {0} to {1}: {2}".format(
                    f, d, copy_err))
            print("[*] {0} error loading {1}".format(err, f))
        else:
            h = get_sha256(f)
            fnx = os.path.join(os.path.split(f)[0], "{0}.fcg.pz".format(h))
            pz.save(g, fnx)
            print("[*] Saved {0}\n".format(fnx))
        # advance for failed files too, so the bar reflects files handled
        pbar.update(progress)
    pbar.finish()
    print("Done.")
def process_apk_dir(dataset_dir):
    """ Convert a series of APK into FCGNX objects

    Every file found in the subtree rooted at dataset_dir is turned into a
    function call graph, which is pickled beside the source file under the
    name "<sha256>.fcg.pz". When building the graph raises, the file is
    copied into "<dataset_dir>/<ExceptionClassName>/" instead, grouping
    the failures by error type.

    Note: as a side effect the interpreter recursion limit is raised to
    100000 for the whole process.

    Args:
        dataset_dir: a directory containing a list of APK files.
    """
    # Imported here so the block is self-contained.
    import shutil

    sys.setrecursionlimit(100000)

    # gather all files below dataset_dir; nothing is filtered out, neither
    # by extension nor because a .fcg.pz already exists
    files = []
    for dirName, _subdirList, fileList in os.walk(dataset_dir):
        for name in fileList:
            files.append(os.path.join(dirName, name))

    # set up progress bar
    print("\nProcessing {0} APK files in dir {1}".format(len(files),
                                                         dataset_dir))
    widgets = ['Building CGs: ',
               Percentage(), ' ',
               Bar(marker='#', left='[', right=']'),
               ' ', ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=len(files))
    pbar.start()

    # loop through .apk files and save them in .fcg format
    for progress, f in enumerate(files, 1):
        print("[] Loading {0}".format(f))
        try:
            g = build_fcg_nx(f)
        # if an exception happens, save the .apk in the corresponding dir
        except Exception as e:
            err = e.__class__.__name__
            d = os.path.join(dataset_dir, err)
            if not os.path.exists(d):
                os.makedirs(d)
            # Copy without going through a shell, so file names containing
            # spaces or shell metacharacters are handled safely.
            try:
                shutil.copy(f, d)
            except EnvironmentError as copy_err:
                # A failed triage copy must not abort the whole run.
                print("[!] Could not copy {0} to {1}: {2}".format(
                    f, d, copy_err))
            print("[*] {0} error loading {1}".format(err, f))
        else:
            h = get_sha256(f)
            fnx = os.path.join(os.path.split(f)[0],
                               "{0}.fcg.pz".format(h))
            pz.save(g, fnx)
            print("[*] Saved {0}\n".format(fnx))
        # count failed files as processed too, keeping the bar accurate
        pbar.update(progress)
    pbar.finish()
    print("Done.")
def run_linear_experiment(self, rocs_filename, iterations=10):
    """
    Run repeated train/test rounds with a linear SVM and collect ROCs.

    Each round re-splits the data with randomize_dataset(), fits a
    LinearSVC whose C is selected by grid-search cross-validation and
    computes a ROC from the decision scores on the test split. All ROCs
    are kept in self.rocs and saved to rocs_filename at the end.

    Args:
        rocs_filename: path of the pz file the list of ROCs is saved to.
        iterations: number of randomized rounds to run.
    """
    param_grid = {'C': np.logspace(-3, 3, 7)}
    self.rocs = []
    for round_idx in xrange(iterations):
        print("[*] Iteration {0}".format(round_idx))
        print("[*] Randomizing dataset...")
        self.randomize_dataset()
        search = GridSearchCV(svm.LinearSVC(), param_grid)
        print("[*] Training...")
        search.fit(self.X_train, self.Y_train)
        scores = search.best_estimator_.decision_function(self.X_test)
        print("[*] Testing...")
        flat_scores = np.float32(scores.flatten())
        expected = np.float32(self.Y_test)
        self.rocs.append(eval.compute_roc(flat_scores, expected))
        print("[*] ROC saved.")
    pz.save(self.rocs, rocs_filename)
def run_closed_experiment(self, iterations=10):
    """
    Train and evaluate a linear SVM on closed-world splits of the dataset.

    For each iteration the dataset is re-split with
    randomize_dataset_closed_world(), a LinearSVC is fitted on the
    training split (C picked by grid-search cross-validation) and the
    test split is predicted. Predictions and true labels of all
    iterations are accumulated in self.predictions / self.true_labels and
    saved as "mca_predictions_closed.pz" and "mca_true_labels_closed.pz".

    Args:
        iterations: number of random splits to train and evaluate.
    """
    param_grid = {'C': np.logspace(-3, 3, 7)}
    self.predictions = np.array([])
    self.true_labels = np.array([])
    for _ in xrange(iterations):
        self.randomize_dataset_closed_world()
        search = GridSearchCV(svm.LinearSVC(), param_grid)
        search.fit(self.X_train, self.Y_train)
        predicted = search.best_estimator_.predict(self.X_test)
        self.predictions = np.append(self.predictions, predicted)
        self.true_labels = np.append(self.true_labels, self.Y_test)
    pz.save(self.predictions, "mca_predictions_closed.pz")
    pz.save(self.true_labels, "mca_true_labels_closed.pz")
def save_data(self):
    """
    Persist the data matrix, the labels and the sample names as pz files.

    Writes self.X to "X.pz", self.Y to "Y.pz" and self.fnames to
    "fnames.pz" so that a later experiment can reuse them without
    extracting all features again.
    """
    targets = (("X", "X.pz"), ("Y", "Y.pz"), ("fnames", "fnames.pz"))
    for attribute, filename in targets:
        pz.save(getattr(self, attribute), filename)
def _start_progress_bar(title, maxval):
    """ Create and start a console progress bar labelled with `title`. """
    widgets = [
        title, Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ', ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=maxval)
    pbar.start()
    return pbar


def nh_kernel_matrix(graph_set, R=1):
    """ Compute the kernel matrix of a set of graphs using the NHK
    and label comparison.

    On each of the R iterations every graph is passed through
    neighborhood_hash(), a label histogram is computed per graph and the
    pairwise histogram_intersection() values are stored in a symmetric
    N x N matrix with ones on the diagonal. The matrix of iteration r is
    saved as "K_<r>.pz"; the function returns the element-wise mean of
    all R matrices.

    Args:
        graph_set: list of graphs. NOTE: modified in place -- each graph
            is overwritten with its hashed version on every iteration.
        R: number of hashing iterations; must be >= 1 (with R=0 there is
            nothing to average and a ZeroDivisionError is raised).

    Returns:
        The average of the R kernel matrices.
    """
    N = len(graph_set)
    K_set = []
    # Number of unordered pairs of distinct graphs compared per iteration.
    pairs_per_iteration = N * (N - 1) // 2
    computation_size = R * pairs_per_iteration
    print("Total number of graphs: {0}".format(N))
    print("Total number of graph comparisons: {0}".format(computation_size))

    for r in xrange(R):
        #compute neighbor hash for nodes in every graph
        print("Starting iteration {0}...".format(r))
        pbar = _start_progress_bar('Computing NH: ', N)
        for i in xrange(N):
            graph_set[i] = neighborhood_hash(graph_set[i])
            pbar.update(i + 1)
        pbar.finish()

        #precompute the label histogram for each graph
        pbar = _start_progress_bar('Computing Label Hist: ', N)
        graph_set_hist = []
        for done, g in enumerate(graph_set, 1):
            graph_set_hist.append(label_histogram(g))
            pbar.update(done)
        pbar.finish()

        #compute upper triangular kernel matrix; the counter restarts on
        #every iteration, so the bar covers one iteration's comparisons
        pbar = _start_progress_bar('Computing K: ', pairs_per_iteration)
        progress = 0
        K = np.identity(N)
        for i in xrange(N):
            for j in xrange(i + 1, N):
                K[i, j] = histogram_intersection(graph_set_hist[i],
                                                 graph_set_hist[j])
                progress += 1
                pbar.update(progress)
        pbar.finish()

        #build lower triangle; adding the transpose doubles the diagonal,
        #which the subtracted identity restores to 1
        K = K + K.transpose() - np.identity(len(K))
        pz.save(K, "K_{0}.pz".format(r))
        K_set.append(K)

    #normalization of K: average over all iterations
    return sum(K_set) / len(K_set)
for j in nbrs: G.add_edge(i, j - 1) elif DATASET in [NCI1, NCI109]: if len(adjacency_list) != len(weight_lists): print('len(adjacency_list) != len(weight_lists)') exit() for i in xrange(nodes_count): nbrs = adjacency_list[i] weights = [] try: weights = weight_lists[i] except IndexError: exit() if len(nbrs) != len(weights): print('len(nbrs) != len(weights)') exit() nbrs_count = len(nbrs) for j in xrange(nbrs_count): G.add_edge(i, nbrs[j] - 1, weight=weights[j]) elif DATASET == MUTAG: edges_count = len(edges) for i in xrange(edges_count): G.add_edge(edges[i][0] - 1, edges[i][1] - 1, weight=edges[i][2]) pz.save(G, str(graph_num) + ".pz")
elif DATASET in [NCI1, NCI109]: if len(adjacency_list) != len(weight_lists): print('len(adjacency_list) != len(weight_lists)') exit() for i in xrange(nodes_count): nbrs = adjacency_list[i] weights = [] try: weights = weight_lists[i] except IndexError: exit() if len(nbrs) != len(weights): print('len(nbrs) != len(weights)') exit() nbrs_count = len(nbrs) for j in xrange(nbrs_count): G.add_edge(i, nbrs[j] - 1, weight = weights[j]) elif DATASET == MUTAG: edges_count = len(edges) for i in xrange(edges_count): G.add_edge(edges[i][0] - 1, edges[i][1] - 1, weight = edges[i][2]) pz.save(G, str(graph_num) + ".pz")
file_path = os.path.join(cur_path, str(counter) + '.pz') G = nx.Graph() # add nodes to graph G nodes_count = len(cur_node_labels) for i in xrange(nodes_count): G.add_node(i, label = cur_node_labels[i]) # add eddges to graph G for edge in cur_edges: cur_first_edge_node = edge[0] cur_second_edge_node = edge[1] cur_edge_label = edge[2] G.add_edge(cur_first_edge_node, cur_second_edge_node, weight = cur_edge_label) pz.save(G, file_path) if cur_class_label_line == None: break counter += 1 fid.close()
# 2) create a networkx graph corresponding to the parsed graph # ---------------------------------------------------------------------------- cur_path = path_class_1 if cur_class_label == 1 else path_class_minus_1 file_path = os.path.join(cur_path, str(counter) + '.pz') G = nx.Graph() # add nodes to graph G nodes_count = len(cur_node_labels) for i in xrange(nodes_count): G.add_node(i, label=cur_node_labels[i]) # add eddges to graph G for edge in cur_edges: cur_first_edge_node = edge[0] cur_second_edge_node = edge[1] cur_edge_label = edge[2] G.add_edge(cur_first_edge_node, cur_second_edge_node, weight=cur_edge_label) pz.save(G, file_path) if cur_class_label_line == None: break counter += 1 fid.close()