Example #1
    def run_open_experiment(self, iterations=10):
        """ Train a classifier on the training data, obtain the best
        combination of parameters through a grid search cross-validation
        and test the classifier using an open-world split of the dataset.
        The results from the number of iterations are saved as pz files.
        """
        self.true_labels = np.array([])
        self.predictions = np.array([])
        for i in xrange(iterations):
            self.randomize_dataset_open_world()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            classes = clf.best_estimator_.classes_
            for scores in out:
                m = np.max(scores)
                # reject as unknown (99) if the top score does not dominate
                if (abs(m / scores) < 0.5).any():
                    self.predictions = np.append(self.predictions, 99)
                else:
                    p = classes[np.argmax(scores)]
                    self.predictions = np.append(self.predictions, p)
            self.true_labels = np.append(self.true_labels, self.Y_test)

        pz.save(self.predictions, "mca_predictions_open.pz")
        pz.save(self.true_labels, "mca_true_labels_open.pz")
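The rejection rule above flags a test sample as unknown (label 99) whenever the top decision score does not clearly dominate the remaining per-class scores. A minimal, self-contained sketch of that rule, with made-up scores and class labels:

import numpy as np

classes = np.array([0, 1, 2])
scores = np.array([-1.2, 0.4, -0.9])  # one row of decision_function output

m = np.max(scores)
# reject if some competing score is more than twice the maximum in magnitude
if (np.abs(m / scores) < 0.5).any():
    prediction = 99                   # open-world "unknown" label
else:
    prediction = classes[np.argmax(scores)]
print(prediction)                     # 99 for these toy scores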
Example #2
def nh_kernel_matrix(graph_set, R=1):
    """ compute the kernel matrix of a set of graphs using the NHK and label
    comparison """

    N = len(graph_set)
    K_set = []

    computation_size = R * (N ** 2 - sum(range(N + 1)))
    print "Total number of graphs: {0}".format(N)
    print "Total number of graph comparisons: {0}".format(computation_size)

    for r in xrange(R):

        # compute neighbor hash for nodes in every graph
        print "Starting iteration {0}...".format(r)
        widgets = ["Computing NH: ", Percentage(), " ", Bar(marker="#", left="[", right="]"), " ", ETA(), " "]
        pbar = ProgressBar(widgets=widgets, maxval=N)
        pbar.start()
        progress = 0
        for i in xrange(N):
            graph_set[i] = neighborhood_hash(graph_set[i])
            progress += 1
            pbar.update(progress)
        pbar.finish()

        # precompute the label histogram for each graph
        widgets = ["Computing Label Hist: ", Percentage(), " ", Bar(marker="#", left="[", right="]"), " ", ETA(), " "]
        pbar = ProgressBar(widgets=widgets, maxval=N)
        pbar.start()
        progress = 0
        graph_set_hist = []
        for i in xrange(N):
            g = graph_set[i]
            hist = label_histogram(g)
            graph_set_hist.append(hist)
            progress += 1
            pbar.update(progress)
        pbar.finish()

        # compute upper triangular kernel matrix
        widgets = ["Computing K: ", Percentage(), " ", Bar(marker="#", left="[", right="]"), " ", ETA(), " "]
        pbar = ProgressBar(widgets=widgets, maxval=computation_size)
        pbar.start()
        progress = 0
        K = np.identity(N)
        for i in xrange(N):
            for j in xrange(i + 1, N):
                k = histogram_intersection(graph_set_hist[i], graph_set_hist[j])
                K[i, j] = k
                progress += 1
                pbar.update(progress)
        pbar.finish()
        # build lower triangle
        K = K + K.transpose() - np.identity(len(K))
        pz.save(K, "K_{0}.pz".format(r))
        K_set.append(K)

    # average the kernel matrices obtained over the R iterations
    return sum(K_set) / len(K_set)
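The comparison count printed above simplifies: N**2 - sum(range(N + 1)) equals N*(N - 1)/2, the number of strictly upper-triangular entries of K, computed once per iteration. A quick stand-alone check with a toy N:

N, R = 5, 2
assert N ** 2 - sum(range(N + 1)) == N * (N - 1) // 2
print(R * N * (N - 1) // 2)  # 20 pairwise comparisons in total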
Example #3
def process_apk_dir(dataset_dir):
    """ Convert a series of APK into FCGNX objects

    Load all APKs in a dir subtree and create FCG objects that are
    pickled for later processing and learning. 

    Args:
        dataset_dir: a directory containing a list of APK files.

    """
    sys.setrecursionlimit(100000)
    files = []

    # collect every file in the directory subtree for processing
    for dirName, subdirList, fileList in os.walk(dataset_dir):
        for f in fileList:
            files.append(os.path.join(dirName, f))

    # set up progress bar
    print "\nProcessing {0} APK files in dir {1}".format(
        len(files), dataset_dir)
    widgets = [
        'Building CGs: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=len(files))
    pbar.start()
    progress = 0

    # loop through .apk files and save them in .fcg format
    for f in files:
        # f = os.path.join(dataset_dir, fn)
        print "[] Loading {0}".format(f)
        try:
            g = build_fcg_nx(f)
        # if an exception happens, save the .apk in the corresponding dir
        except Exception as e:
            err = e.__class__.__name__
            err_dir = err + "/"
            d = os.path.join(dataset_dir, err_dir)
            if not os.path.exists(d):
                os.makedirs(d)
            cmd = "cp {0} {1}".format(f, d)
            os.system(cmd)
            print "[*] {0} error loading {1}".format(err, f)
            continue

        h = get_sha256(f)
        fnx = os.path.join(os.path.split(f)[0], "{0}.fcg.pz".format(h))
        pz.save(g, fnx)
        print "[*] Saved {0}\n".format(fnx)
        progress += 1
        pbar.update(progress)
    pbar.finish()
    print "Done."
Example #5
    def run_linear_experiment(self, rocs_filename, iterations=10):
        """ Train a linear SVM over several randomized splits of the
        dataset, compute a ROC curve for each iteration and save the
        list of ROCs as a pz file.
        """
        self.rocs = []
        for i in xrange(iterations):
            print "[*] Iteration {0}".format(i)
            print "[*] Randomizing dataset..."
            self.randomize_dataset()
            clf = GridSearchCV(svm.LinearSVC(), {'C':np.logspace(-3,3,7)})
            print "[*] Training..."
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            print "[*] Testing..."
            roc = eval.compute_roc(np.float32(out.flatten()), np.float32(self.Y_test))
            self.rocs.append(roc)
            print "[*] ROC saved."
        pz.save(self.rocs, rocs_filename)
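eval.compute_roc is a project helper that is not shown in these snippets. Assuming scikit-learn is available (the code already uses it for GridSearchCV and svm), an equivalent ROC computation on toy data could look like:

import numpy as np
from sklearn.metrics import roc_curve, auc

scores = np.array([0.9, -0.3, 0.4, -0.8])  # toy decision_function output
labels = np.array([1, 0, 1, 0])            # toy binary ground truth
fpr, tpr, _ = roc_curve(labels, scores)
print(auc(fpr, tpr))                       # 1.0: the toy scores separate perfectly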
Example #6
    def run_closed_experiment(self, iterations=10):
        """ Train a classifier on test data, obtain the best combination of
        paramters through a grid search cross-validation and test the classifier
        using a closed-world split of the dataset. The results from the number
        of iterations are saved as pz files.
        """
        self.true_labels = np.array([])
        self.predictions = np.array([])
        for i in xrange(iterations):
            self.randomize_dataset_closed_world()
            clf = GridSearchCV(svm.LinearSVC(), {'C':np.logspace(-3,3,7)})
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.predict(self.X_test)
            self.predictions = np.append(self.predictions, out)
            self.true_labels = np.append(self.true_labels, self.Y_test)

        pz.save(self.predictions, "mca_predictions_closed.pz")
        pz.save(self.true_labels, "mca_true_labels_closed.pz")
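The two saved arrays can be scored directly once reloaded; a minimal sketch with toy data standing in for the contents of the .pz files:

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

true_labels = np.array([0, 1, 1, 2])
predictions = np.array([0, 1, 2, 2])
print(accuracy_score(true_labels, predictions))  # 0.75
print(confusion_matrix(true_labels, predictions))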
Example #7
    def save_data(self):
        """ Store pz objects for the data matrix, the labels and the names
        of the original samples, so that they can be reused in a new
        experiment without the need to extract all features again.
        """
        pz.save(self.X, "X.pz")
        pz.save(self.Y, "Y.pz")
        pz.save(self.fnames, "fnames.pz")
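The pz module used throughout these examples is not defined in the snippets. Assuming it is a gzip-compressed pickle helper, a minimal stand-in with the same save interface could be:

import gzip
import pickle

def save(obj, filename):
    """Serialize obj to a gzip-compressed pickle file (assumed pz format)."""
    with gzip.open(filename, "wb") as f:
        pickle.dump(obj, f)

def load(filename):
    """Load an object written by save()."""
    with gzip.open(filename, "rb") as f:
        return pickle.load(f)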
Example #9
            for j in nbrs:
                G.add_edge(i, j - 1)

    elif DATASET in [NCI1, NCI109]:
        if len(adjacency_list) != len(weight_lists):
            print('len(adjacency_list) != len(weight_lists)')
            exit()

        for i in xrange(nodes_count):
            nbrs = adjacency_list[i]
            weights = []
            try:
                weights = weight_lists[i]
            except IndexError:
                exit()

            if len(nbrs) != len(weights):
                print('len(nbrs) != len(weights)')
                exit()

            nbrs_count = len(nbrs)
            for j in xrange(nbrs_count):
                G.add_edge(i, nbrs[j] - 1, weight=weights[j])

    elif DATASET == MUTAG:
        edges_count = len(edges)
        for i in xrange(edges_count):
            G.add_edge(edges[i][0] - 1, edges[i][1] - 1, weight=edges[i][2])

    pz.save(G, str(graph_num) + ".pz")
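Each parser branch above shifts the 1-based node ids used in the dataset files to 0-based networkx nodes, hence the "- 1". A self-contained toy version of the MUTAG-style branch:

import networkx as nx

edges = [(1, 2, 7), (2, 3, 1)]     # (src, dst, edge label), 1-based ids
G = nx.Graph()
for src, dst, label in edges:
    G.add_edge(src - 1, dst - 1, weight=label)
print(G.edges(data=True))          # nodes 0..2, labels stored as weights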
         
Example #12
    # 2) create a networkx graph corresponding to the parsed graph
    # ----------------------------------------------------------------------------
    cur_path = path_class_1 if cur_class_label == 1 else path_class_minus_1
    file_path = os.path.join(cur_path, str(counter) + '.pz')

    G = nx.Graph()

    # add nodes to graph G
    nodes_count = len(cur_node_labels)

    for i in xrange(nodes_count):
        G.add_node(i, label=cur_node_labels[i])

    # add edges to graph G
    for edge in cur_edges:
        cur_first_edge_node = edge[0]
        cur_second_edge_node = edge[1]
        cur_edge_label = edge[2]
        G.add_edge(cur_first_edge_node,
                   cur_second_edge_node,
                   weight=cur_edge_label)

    pz.save(G, file_path)

    if cur_class_label_line is None:
        break

    counter += 1

fid.close()
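A quick way to inspect what the parser builds, with hypothetical toy values standing in for the file-driven variables above:

import networkx as nx

cur_node_labels = ['C', 'N', 'O']   # toy node labels
cur_edges = [(0, 1, 1), (1, 2, 2)]  # toy (node, node, edge label) triples

G = nx.Graph()
for i, lbl in enumerate(cur_node_labels):
    G.add_node(i, label=lbl)
for u, v, w in cur_edges:
    G.add_edge(u, v, weight=w)
print(G.nodes(data=True))
print(G.edges(data=True))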