Example #1
File: analysis.py Project: sherplus/adagio
    def run_linear_open_experiment(self, iterations=10, save=False):
        """
        Train a classifier on the training data, find the best combination of
        parameters through grid search cross-validation and test the
        classifier using an open-world split of the dataset. The results
        from all iterations are saved as pz files.

        :param iterations: number of runs (training/testing)
        :param save: save predictions and labels if True
        """
        self.true_labels = np.array([])
        self.predictions = np.array([])
        for i in xrange(iterations):
            self.randomize_dataset_open_world()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            classes = clf.best_estimator_.classes_
            for scores in out:
                m = np.max(scores)
                # Reject the sample as unknown (label 99) if any per-class
                # score is more than twice the magnitude of the top score.
                if (abs(m / scores) < 0.5).any():
                    self.predictions = np.append(self.predictions, 99)
                else:
                    p = classes[np.where(scores == m)]
                    self.predictions = np.append(self.predictions, p)
            self.true_labels = np.append(self.true_labels, self.Y_test)

        if save:
            pz.save(self.predictions, "mca_predictions_open.pz")
            pz.save(self.true_labels, "mca_true_labels_open.pz")
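The rejection rule used above is easier to read in isolation. The following is a minimal sketch, not part of adagio; the helper name reject_or_predict and its parameters are illustrative only.

import numpy as np

def reject_or_predict(scores, classes, ratio=0.5, unknown_label=99):
    """Return the predicted class, or unknown_label when any per-class
    score exceeds 1/ratio times the magnitude of the top score."""
    m = np.max(scores)
    if (np.abs(m / scores) < ratio).any():
        return unknown_label
    return classes[np.argmax(scores)]

# Example: scores = np.array([0.3, -0.9, 0.1]) -> abs(0.3 / -0.9) < 0.5,
# so the sample would be labeled 99 (unknown).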
Example #2
File: analysis.py Project: sherplus/adagio
    def run_linear_experiment(self, rocs_filename, iterations=10):
        """
        Run a classification experiment over several iterations.
        In each iteration the data is randomized, a linear SVM classifier
        is trained and evaluated using cross-validation over the
        cost parameter in the range np.logspace(-3, 3, 7). The best
        classifier is used for testing and a ROC curve is computed,
        stored as a property and saved locally.

        :param rocs_filename: the file where all computed ROC curves are saved
        :param iterations: number of runs (training/testing)
        """
        for i in xrange(iterations):
            print "[*] Iteration {0}".format(i)
            print "[*] Randomizing dataset..."
            self.randomize_dataset()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            print "[*] Training..."
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            print "[*] Testing..."
            roc = eval.compute_roc(np.float32(out.flatten()),
                                   np.float32(self.Y_test))
            self.rocs.append(roc)
            print "[*] ROC saved."
        pz.save(self.rocs, rocs_filename)
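adagio's eval.compute_roc is project-specific; for reference, an equivalent ROC computation with scikit-learn might look like the sketch below (assuming binary labels in Y_test; the helper name compute_roc_sklearn is hypothetical).

import numpy as np
from sklearn.metrics import roc_curve, auc

def compute_roc_sklearn(scores, y_true):
    """Compute FPR/TPR points and the AUC from decision scores (sketch)."""
    fpr, tpr, _ = roc_curve(y_true, scores)
    return fpr, tpr, auc(fpr, tpr)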
Example #3
File: graphs.py Project: sherplus/adagio
def process_dir(read_dir, out_dir, mode='FCG'):
    """ Convert a series of APK into graph objects. Load all
    APKs in a dir subtree and create graph objects that are pickled
    for later processing and learning.
    """
    sys.setrecursionlimit(100000)
    files = []

    # collect all files under the directory subtree for processing
    for dirName, subdirList, fileList in os.walk(read_dir):
        for f in fileList:
            files.append(os.path.join(dirName, f))

    # set up progress bar
    print "\nProcessing {} APK files in dir {}".format(len(files), read_dir)
    widgets = ['Building graphs: ',
               Percentage(), ' ',
               Bar(marker='#', left='[', right=']'),
               ' ', ETA(), ' ']

    pbar = ProgressBar(widgets=widgets, maxval=len(files))
    pbar.start()
    progress = 0

    # loop through .apk files and save them in .pdg.pz format
    for f in files:

        f = os.path.realpath(f)
        print '[*] Loading {0}'.format(f)
        try:
            # compare strings by value, not identity
            if mode == 'FCG':
                graph = FCG(f)
            elif mode == 'PDG':
                graph = PDG(f)

        # if an exception happens, save the .apk in the corresponding dir
        except Exception as e:
            err = e.__class__.__name__
            err_dir = err + "/"
            d = os.path.join(read_dir, err_dir)
            if not os.path.exists(d):
                os.makedirs(d)
            cmd = "cp {} {}".format(f, d)
            os.system(cmd)
            print "[*] {} error loading {}".format(err, f)
            continue

        h = get_sha256(f)
        if out_dir:
            out = out_dir
        else:
            out = read_dir
        fnx = os.path.join(out, "{}.pz".format(h))
        pz.save(graph.g, fnx)
        print "[*] Saved {}\n".format(fnx)
        progress += 1
        pbar.update(progress)
    pbar.finish()
    print "Done."
Example #4
File: analysis.py Project: sherplus/adagio
    def run_linear_closed_experiment(self, iterations=10, save=False):
        """
        Train a classifier on the training data, find the best combination of
        parameters through grid search cross-validation and test the
        classifier using a closed-world split of the dataset. The results
        from all iterations are saved as pz files.

        :param iterations: number of runs (training/testing)
        :param save: save predictions and labels if True
        """
        self.true_labels = np.array([])
        self.predictions = np.array([])
        for i in xrange(iterations):
            self.randomize_dataset_closed_world()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.predict(self.X_test)
            self.predictions = np.append(self.predictions, out)
            self.true_labels = np.append(self.true_labels, self.Y_test)

        if save:
            pz.save(self.predictions, "mca_predictions_closed.pz")
            pz.save(self.true_labels, "mca_true_labels_closed.pz")
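The accumulated predictions and true labels can then be summarized with standard scikit-learn metrics; a minimal sketch (the helper summarize_closed_world is hypothetical, not part of adagio):

from sklearn.metrics import accuracy_score, confusion_matrix

def summarize_closed_world(true_labels, predictions):
    """Return overall accuracy and the confusion matrix (sketch)."""
    acc = accuracy_score(true_labels, predictions)
    cm = confusion_matrix(true_labels, predictions)
    return acc, cm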
Example #5
File: analysis.py Project: sherplus/adagio
    def save_data(self):
        """ Store pz objects for the data matrix, the labels and
            the names of the original samples so that they can be used
            in a new experiment without the need to extract all
            features again
        """
        print "[*] Saving labels, data matrix and file names..."
        pz.save(self.X, "X.pz")
        pz.save(self.Y, "Y.pz")
        pz.save(self.fnames, "fnames.pz")
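Assuming the pz module provides a matching load function (its save calls above suggest a pickle-like interface), reloading the stored objects could look like this sketch; load_data is hypothetical:

def load_data():
    # Hypothetical counterpart to save_data; assumes pz.load(filename) exists.
    X = pz.load("X.pz")
    Y = pz.load("Y.pz")
    fnames = pz.load("fnames.pz")
    return X, Y, fnames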