Esempio n. 1
0
    def plot_average_rocs(self, filename, roc_pickles, boundary=0.1):
        """
        Average several ROC curves from different models and plot
        them together in the same figure.

        :filename: name of the file to save the plot
        :roc_pickles: list of pz file names containing several rocs each
        :boundary: upper False Positive Rate limit for the roc plot
        :returns: None. It saves the roc plot in a png file with the
            specified filename
        """
        # common FPR grid every pickled ROC is averaged on
        fps = np.linspace(0.0, 1.0, 10000)
        plt.figure(figsize=(18.5, 10.5))
        # one distinct black line/marker style per curve (up to 10 files)
        linestyles = ['k-', 'k--', 'k-.', 'k:', 'k.',
                      'k*', 'k^', 'ko', 'k+', 'kx']
        # zip already stops at the shorter sequence, so no explicit
        # slicing of linestyles is needed
        for f, style in zip(roc_pickles, linestyles):
            avg_roc, std_roc = eval.average_roc(pz.load(f), fps)
            # avg_roc is (tpr, fpr); plot FPR on x, TPR on y
            plt.plot(avg_roc[1], avg_roc[0], style)
        # BUG FIX: loc must be a keyword argument; the legacy positional
        # form plt.legend(labels, loc) was removed from matplotlib.
        plt.legend(roc_pickles, loc='lower right', shadow=True)
        plt.xlabel('False Positive Rate')
        plt.xlim((0.0, boundary))
        plt.ylabel('True Positive Rate')
        plt.ylim((0.0, 1.0))
        plt.title("Average ROCs")
        plt.grid(True)
        plt.savefig(filename, format='png')
Esempio n. 2
0
    def plot_average_rocs(self, filename, roc_pickles, boundary=0.1):
        """
        Average several ROC curves from different models and plot
        them together in the same figure.

        :filename: name of the file to save the plot
        :roc_pickles: list of pz file names containing several rocs each
        :boundary: upper False Positive Rate limit for the roc plot
        :returns: None. It saves the roc plot in a png file with the
            specified filename
        """
        # common FPR grid every pickled ROC is averaged on
        fps = np.linspace(0.0, 1.0, 10000)
        plt.figure(figsize=(18.5, 10.5))
        # one distinct black line/marker style per curve (up to 10 files)
        linestyles = [
            'k-', 'k--', 'k-.', 'k:', 'k.', 'k*', 'k^', 'ko', 'k+', 'kx'
        ]
        # zip already stops at the shorter sequence, so no explicit
        # slicing of linestyles is needed
        for f, style in zip(roc_pickles, linestyles):
            avg_roc, std_roc = eval.average_roc(pz.load(f), fps)
            # avg_roc is (tpr, fpr); plot FPR on x, TPR on y
            plt.plot(avg_roc[1], avg_roc[0], style)
        # BUG FIX: loc must be a keyword argument; the legacy positional
        # form plt.legend(labels, loc) was removed from matplotlib.
        plt.legend(roc_pickles, loc='lower right', shadow=True)
        plt.xlabel('False Positive Rate')
        plt.xlim((0.0, boundary))
        plt.ylabel('True Positive Rate')
        plt.ylim((0.0, 1.0))
        plt.title("Average ROCs")
        plt.grid(True)
        plt.savefig(filename, format='png')
Esempio n. 3
0
    def get_high_ranked_neighborhoods(self,
                                      fcg_file,
                                      sorted_weights_idx,
                                      n_weights=3):
        """
        Retrieve the neighborhoods in a hashed graph with maximum weights.

        :param fcg_file: path of file containing a fcg
        :param sorted_weights_idx: index that sort the weights from the
                linear classifier
        :param n_weights: number of weights with maximum value to retrieve
                the associated neighborhoods
        :returns: a list of matching neighborhoods (possibly empty).
        """
        # g = FCG.build_fcg(fcg_file)
        g = pz.load(fcg_file)
        g_hash = ml.neighborhood_hash(g)
        bits = len(instructionSet.INSTRUCTION_CLASS_COLOR)

        neighborhoods = []
        remaining_weights = n_weights

        for idx in sorted_weights_idx:
            # stop once n_weights label groups have been collected
            if remaining_weights <= 0:
                break
            # map the weight index back to its decimal neighborhood label
            # (self.b is the binarization width set in __init__); // keeps
            # the original Python 2 integer-division semantics explicit
            label_decimal = idx // self.b
            label_bin = np.binary_repr(label_decimal, bits)
            label = np.array([int(i) for i in label_bin])
            matching_neighborhoods = []
            for m, nh in g_hash.node.iteritems():
                if np.array_equal(nh["label"], label):
                    neighborhood = "{0} {1}.{2}({3})".format(
                        remaining_weights, m[0], m[1], m[2])
                    matching_neighborhoods.append(neighborhood)
            # only consume a weight slot when the label actually matched
            # something in the graph
            if matching_neighborhoods:
                remaining_weights -= 1
                neighborhoods += matching_neighborhoods
        # free the (potentially large) graph objects before returning
        del g
        del g_hash
        # BUG FIX: the original only returned inside the loop, so when
        # sorted_weights_idx was exhausted before n_weights labels matched
        # the function implicitly returned None. Always return the list.
        return neighborhoods
Esempio n. 4
0
    def get_high_ranked_neighborhoods(self, fcg_file,
                                      sorted_weights_idx, n_weights=3):
        """
        Retrieve the neighborhoods in a hashed graph with maximum weights.

        :param fcg_file: path of file containing a fcg
        :param sorted_weights_idx: index that sort the weights from the
                linear classifier
        :param n_weights: number of weights with maximum value to retrieve
                the associated neighborhoods
        :returns: a list of matching neighborhoods (possibly empty).
        """
        # g = FCG.build_fcg(fcg_file)
        g = pz.load(fcg_file)
        g_hash = ml.neighborhood_hash(g)
        bits = len(instructionSet.INSTRUCTION_CLASS_COLOR)

        neighborhoods = []
        remaining_weights = n_weights

        for idx in sorted_weights_idx:
            # stop once n_weights label groups have been collected
            if remaining_weights <= 0:
                break
            # map the weight index back to its decimal neighborhood label
            # (self.b is the binarization width set in __init__); // keeps
            # the original Python 2 integer-division semantics explicit
            label_decimal = idx // self.b
            label_bin = np.binary_repr(label_decimal, bits)
            label = np.array([int(i) for i in label_bin])
            matching_neighborhoods = []
            for m, nh in g_hash.node.iteritems():
                if np.array_equal(nh["label"], label):
                    neighborhood = "{0} {1}.{2}({3})".format(
                        remaining_weights, m[0], m[1], m[2])
                    matching_neighborhoods.append(neighborhood)
            # only consume a weight slot when the label actually matched
            # something in the graph
            if matching_neighborhoods:
                remaining_weights -= 1
                neighborhoods += matching_neighborhoods
        # free the (potentially large) graph objects before returning
        del g
        del g_hash
        # BUG FIX: the original only returned inside the loop, so when
        # sorted_weights_idx was exhausted before n_weights labels matched
        # the function implicitly returned None. Always return the list.
        return neighborhoods
Esempio n. 5
0
    def __init__(self, dirs, labels, split, max_files=0, max_node_size=0, 
                 precomputed_matrix="", y="", fnames=""):
        """
        Load sets of pickled graph objects from different directories,
        where the objects in each directory belong to a different class,
        and build the data matrix for classification. The resulting
        object provides methods to train and test a linear classifier
        on the feature vectors generated from the graph objects.

        :dirs: A list with directories including types of files for
            classification e.g. <[MALWARE_DIR, CLEAN_DIR]> or just
            directories with samples from different malware families
        :labels: The labels assigned to samples in each directory.
            For example a number or a string.
        :split: The percentage of samples used for training (value
            between 0 and 1)
        :max_files: maximum number of files read per directory
            (passed to read_files; presumably 0 means "no limit" —
            confirm against read_files)
        :max_node_size: graphs with number_of_nodes() >= this value are
            skipped; 0 disables the size filter
        :precomputed_matrix: name of file if a data or kernel matrix
            has already been computed.
        :y: If precomputed_matrix is given, a pickled and gzipped list
            of labels must be provided.
        :fnames: If precomputed_matrix is given, a pickled and gzipped
            list of the matching file names must be provided.
        :returns: an Analysis object with the dataset as a set of
            properties and several functions to train, test, evaluate
            or run a learning experiment iteratively
        """

        # fraction of samples used for training
        self.split = split
        # data matrix: list of per-sample feature vectors until stacked
        # into a numpy array at the end of this method
        self.X = []
        # label vector, one entry per successfully loaded sample
        self.Y = np.array([])
        # file name of each loaded sample, aligned with X and Y
        self.fnames = []
        self.X_train = [] 
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        # classifier object, filled in by the training routines
        self.clf = ""
        self.out = []
        # self.roc = 0
        self.rocs = []
        self.auc = 0
        # binarization width produced by ml.make_binary below
        self.b = 0
        # per-sample feature extraction time, for performance evaluation
        self.feature_vector_times = []
        # histogram over the 2**15 possible neighborhood-hash labels
        self.label_dist = np.zeros(2**15)
        # graph sizes (node counts), for dataset statistics
        self.sample_sizes = []
        self.neighborhood_sizes = []
        # accumulated distribution of the 15 instruction classes
        self.class_dist = np.zeros(15)
        self.predictions = []
        self.true_labels = []

        if precomputed_matrix:
            # Load the y labels and file names from zip pickle objects.
            print "Loading matrix..."
            self.X = pz.load(precomputed_matrix)
            print "[*] matrix loaded"
            self.Y = pz.load(y)
            print "[*] labels loaded"
            self.fnames = pz.load(fnames)
            print "[*] file names loaded"

        else:
            # loop over dirs; d is a (directory, label) pair
            for d in zip(dirs, labels):
                # collect the *.fcg.pz sample files in this directory
                files = self.read_files(d[0], "fcg.pz", max_files)
                print "Loading samples in dir {0} with label {1}".format(d[0],
                                                                         d[1])
                widgets = ['Unpickling... : ',
                           Percentage(), ' ',
                           Bar(marker='#', left='[', right=']'),
                           ' ', ETA(), ' ']
                pbar = ProgressBar(widgets=widgets, maxval=len(files))
                pbar.start()
                progress = 0

                # load labels and feature vectors
                for f in files:
                    try: 
                        g = pz.load(f)
                        size = g.number_of_nodes()
                        # apply the optional node-count filter; empty
                        # graphs are always skipped
                        if size < max_node_size or max_node_size == 0:
                            if size > 0:
                                t0 = time.time()
                                x_i = self.compute_label_histogram(g)
                                # save feature vector computing time for
                                # performance evaluation
                                self.feature_vector_times.append(time.time() -
                                                                 t0)
                                # save distribution of generated labels
                                self.label_dist = np.sum([self.label_dist,
                                                          x_i], axis=0)
                                # save sizes of the sample for further analysis
                                # of the dataset properties
                                self.sample_sizes.append(size)
                                self.neighborhood_sizes += ml.neighborhood_sizes(g)
                                for n, l in g.node.iteritems():
                                    self.class_dist = np.sum([self.class_dist,
                                                              l["label"]], axis=0)
                                # delete nx object to free memory
                                del g
                                self.X.append(x_i)
                                self.Y = np.append(self.Y, [int(d[1])])
                                self.fnames.append(f)
                    # best-effort loading: report and skip any sample
                    # that fails to unpickle or process
                    except Exception, e:
                        print e
                        print "err: {0}".format(f)
                        pass
                    progress += 1
                    pbar.update(progress)

                pbar.finish()

            # convert feature vectors to its binary representation
            # and make the data matrix sparse
            print "[*] Stacking feature vectors..."
            self.X = np.array(self.X, dtype=np.int16)
            print "[*] Converting features vectors to binary..."
            self.X, self.b = ml.make_binary(self.X) 
Esempio n. 6
0
    def __init__(self,
                 dirs,
                 labels,
                 split,
                 max_files=0,
                 max_node_size=0,
                 precomputed_matrix="",
                 y="",
                 fnames=""):
        """
        Load sets of pickled graph objects from different directories,
        where the objects in each directory belong to a different class,
        and build the data matrix for classification. The resulting
        object provides methods to train and test a linear classifier
        on the feature vectors generated from the graph objects.

        :dirs: A list with directories including types of files for
            classification e.g. <[MALWARE_DIR, CLEAN_DIR]> or just
            directories with samples from different malware families
        :labels: The labels assigned to samples in each directory.
            For example a number or a string.
        :split: The percentage of samples used for training (value
            between 0 and 1)
        :max_files: maximum number of files read per directory
            (passed to read_files; presumably 0 means "no limit" —
            confirm against read_files)
        :max_node_size: graphs with number_of_nodes() >= this value are
            skipped; 0 disables the size filter
        :precomputed_matrix: name of file if a data or kernel matrix
            has already been computed.
        :y: If precomputed_matrix is given, a pickled and gzipped list
            of labels must be provided.
        :fnames: If precomputed_matrix is given, a pickled and gzipped
            list of the matching file names must be provided.
        :returns: an Analysis object with the dataset as a set of
            properties and several functions to train, test, evaluate
            or run a learning experiment iteratively
        """

        # fraction of samples used for training
        self.split = split
        # data matrix: list of per-sample feature vectors until stacked
        # into a numpy array at the end of this method
        self.X = []
        # label vector, one entry per successfully loaded sample
        self.Y = np.array([])
        # file name of each loaded sample, aligned with X and Y
        self.fnames = []
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        # classifier object, filled in by the training routines
        self.clf = ""
        self.out = []
        # self.roc = 0
        self.rocs = []
        self.auc = 0
        # binarization width produced by ml.make_binary below
        self.b = 0
        # per-sample feature extraction time, for performance evaluation
        self.feature_vector_times = []
        # histogram over the 2**15 possible neighborhood-hash labels
        self.label_dist = np.zeros(2**15)
        # graph sizes (node counts), for dataset statistics
        self.sample_sizes = []
        self.neighborhood_sizes = []
        # accumulated distribution of the 15 instruction classes
        self.class_dist = np.zeros(15)
        self.predictions = []
        self.true_labels = []

        if precomputed_matrix:
            # Load the y labels and file names from zip pickle objects.
            print "Loading matrix..."
            self.X = pz.load(precomputed_matrix)
            print "[*] matrix loaded"
            self.Y = pz.load(y)
            print "[*] labels loaded"
            self.fnames = pz.load(fnames)
            print "[*] file names loaded"

        else:
            # loop over dirs; d is a (directory, label) pair
            for d in zip(dirs, labels):
                # collect the *.fcg.pz sample files in this directory
                files = self.read_files(d[0], "fcg.pz", max_files)
                print "Loading samples in dir {0} with label {1}".format(
                    d[0], d[1])
                widgets = [
                    'Unpickling... : ',
                    Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ',
                    ETA(), ' '
                ]
                pbar = ProgressBar(widgets=widgets, maxval=len(files))
                pbar.start()
                progress = 0

                # load labels and feature vectors
                for f in files:
                    try:
                        g = pz.load(f)
                        size = g.number_of_nodes()
                        # apply the optional node-count filter; empty
                        # graphs are always skipped
                        if size < max_node_size or max_node_size == 0:
                            if size > 0:
                                t0 = time.time()
                                x_i = self.compute_label_histogram(g)
                                # save feature vector computing time for
                                # performance evaluation
                                self.feature_vector_times.append(time.time() -
                                                                 t0)
                                # save distribution of generated labels
                                self.label_dist = np.sum(
                                    [self.label_dist, x_i], axis=0)
                                # save sizes of the sample for further analysis
                                # of the dataset properties
                                self.sample_sizes.append(size)
                                self.neighborhood_sizes += ml.neighborhood_sizes(
                                    g)
                                for n, l in g.node.iteritems():
                                    self.class_dist = np.sum(
                                        [self.class_dist, l["label"]], axis=0)
                                # delete nx object to free memory
                                del g
                                self.X.append(x_i)
                                self.Y = np.append(self.Y, [int(d[1])])
                                self.fnames.append(f)
                    # best-effort loading: report and skip any sample
                    # that fails to unpickle or process
                    except Exception, e:
                        print e
                        print "err: {0}".format(f)
                        pass
                    progress += 1
                    pbar.update(progress)

                pbar.finish()

            # convert feature vectors to its binary representation
            # and make the data matrix sparse
            print "[*] Stacking feature vectors..."
            self.X = np.array(self.X, dtype=np.int16)
            print "[*] Converting features vectors to binary..."
            self.X, self.b = ml.make_binary(self.X)