Example #1
0
def create_int_list_from_x_adv(x_adv: CArray, embedding_value: int,
                               is_shifting_values: bool) -> bytearray:
    """
    Turn a CArray sample into the bytes it encodes.

    Parameters
    ----------
    x_adv : CArray
        the sample as a CArray
    embedding_value : int
        the value used for padding the sample; -1 is mapped to 256
    is_shifting_values : bool
        True if the values are shifted by one (1 is subtracted from
        every entry before the conversion)

    Returns
    -------
    bytearray
        the first row of the sample, truncated before the first
        position holding the padding value and cast to uint8
    """
    if embedding_value == -1:
        padding_marker = 256
    else:
        padding_marker = embedding_value
    # Positions holding the padding marker; an empty result is falsy
    marker_hits = x_adv.find(x_adv == padding_marker)
    trimmed = x_adv[:marker_hits[0]] if marker_hits else x_adv
    if is_shifting_values:
        trimmed = trimmed - 1
    first_row = trimmed[0, :]
    byte_values = first_row.astype(np.uint8).flatten().tolist()
    return bytearray(byte_values)
Example #2
0
def create_int_list_from_x_adv(x_adv: CArray, embedding_value: int,
                               is_shifting_values: bool) -> bytearray:
    """
    Convert a CArray sample to the bytes it encodes.

    Parameters
    ----------
    x_adv : CArray
        the sample as a CArray
    embedding_value : int
        the value used for padding the sample; -1 is mapped to 256
    is_shifting_values : bool
        True if the values are shifted by one (1 is subtracted from
        every entry before the conversion)

    Returns
    -------
    bytearray
        the first row of the sample, truncated before the first
        position holding the padding value and cast to uint8
    """
    invalid_value = 256 if embedding_value == -1 else embedding_value
    padding_positions = x_adv.find(x_adv == invalid_value)
    if padding_positions:
        x_adv = x_adv[:padding_positions[0]]
    if is_shifting_values:
        x_adv = x_adv - 1
    # Flatten before converting instead of taking `tolist()[0]`: the
    # result is a flat list of ints whether the row slice is 1-D or 2-D.
    # With `tolist()[0]` a 1-D slice would hand a single int to
    # bytearray(), which silently builds a zero-filled buffer.
    x_adv_edit = x_adv[0, :].astype(np.uint8).flatten().tolist()
    return bytearray(x_adv_edit)
Example #3
0
 def create_real_sample_from_adv(self,
                                 original_file_path: str,
                                 x_adv: CArray,
                                 new_file_path: str = None):
     """
     Build the adversarial file content from a perturbed sample.

     Parameters
     ----------
     original_file_path : str
         path of the file whose bytes are used as the starting content
     x_adv : CArray
         the perturbed sample
     new_file_path : str, optional, default None
         where to write the result; nothing is written when left None

     Returns
     -------
     bytearray
         the original file content with its leading bytes replaced by
         the bytes encoded in `x_adv`
     """
     with open(original_file_path, 'rb') as original_file:
         sample_bytes = bytearray(original_file.read())
     embedding_value = self.classifier.get_embedding_value()
     padding_hits = x_adv.find(x_adv == embedding_value)
     adv_values = copy.copy(x_adv)
     if padding_hits:
         # Keep only what precedes the first padding position
         adv_values = adv_values[0, :padding_hits[0]]
     if self.shift_values:
         adv_values = adv_values - 1
     adv_bytes = bytes(adv_values.astype(np.uint8).flatten().tolist())
     # Overwrite the head of the original content with the new bytes
     sample_bytes[:len(adv_bytes)] = adv_bytes
     if new_file_path:
         with open(new_file_path, 'wb') as output_file:
             output_file.write(sample_bytes)
     return sample_bytes
    def predict(self, x: CArray, return_decision_function: bool = True):
        """
        Classify a sample given in input space.

        Entries equal to 256 are treated as padding: the sample is cut
        before the first such position, then features are extracted and
        passed to the wrapped classifier.

        Parameters
        ----------
        x : CArray
            The input sample in input space.
        return_decision_function : bool, default True
            Forwarded to the wrapped classifier's ``predict``. If True,
            the decision function value is requested in addition to
            the label.

        Returns
        -------
        CArray, (CArray)
            Whatever ``self.classifier.predict`` returns for the
            extracted features: the label and, when
            return_decision_function is True, also the output of the
            decision function.
        """
        padding_hits = x.find(x == 256)
        sample = x[0, :padding_hits[0]] if padding_hits else x
        features = self.extract_features(sample)
        return self.classifier.predict(
            features, return_decision_function=return_decision_function)
    def create_real_sample_from_adv(self,
                                    original_file_path: str,
                                    x_adv: CArray,
                                    new_file_path: str = None) -> bytearray:
        """
        Materialize a perturbed sample as real file content.

        Parameters
        ----------
        original_file_path : str
            the original malware sample
        x_adv : CArray
            the perturbed malware sample, as created by the optimizer
        new_file_path : str, optional, default None
            the path where to save the adversarial malware.
            Leave None to not save the result to disk

        Returns
        -------
        bytearray
            the original file content with its leading bytes replaced
            by the bytes encoded in `x_adv`
        """
        with open(original_file_path, 'rb') as source_file:
            content = bytearray(source_file.read())

        pad_value = self.classifier.get_embedding_value()
        pad_positions = x_adv.find(x_adv == pad_value)

        perturbed = copy.copy(x_adv)
        if pad_positions:
            # Drop everything from the first padding position onwards
            perturbed = perturbed[0, :pad_positions[0]]
        if self.shift_values:
            perturbed = perturbed - 1

        int_values = perturbed.astype(np.uint8).flatten().tolist()
        new_head = bytes(int_values)
        # Replace the leading bytes of the original content
        content[:len(new_head)] = new_head

        if new_file_path:
            with open(new_file_path, 'wb') as target_file:
                target_file.write(content)
        return content
Example #6
0
 def init_starting_point(self, x: CArray):
     """
     Store the starting sample and clear the stored results.

     Entries equal to 256 are treated as padding: when any is found,
     the stored sample is cut before the first such position.

     Parameters
     ----------
     x : CArray
         the starting sample
     """
     self._original_x = x
     pad_hits = x.find(x == 256)
     self.clear_results()
     if not pad_hits:
         return
     first_pad = pad_hits[0]
     self._original_x = self._original_x[0, :first_pad]
Example #7
0
    def load(self, ds, digits=tuple(range(0, 10)), num_samples=None):
        """Load the MNIST images and labels of the requested split.

        Adapted from: http://cvxopt.org/_downloads/mnist.py

        Extra dataset attributes:
         - 'img_w', 'img_h': size of the images in pixels.
         - 'y_original': array with the original labels (before renumbering)

        Parameters
        ----------
        ds : str
            Identifier of the dataset to load,
            either 'training' or 'testing'.
        digits : tuple
            Tuple with the digits to load. By default all digits are loaded.
            The label of each returned sample is the position of its
            digit inside this tuple.
        num_samples : int or None, optional
            Number of expected samples in resulting ds.
            If int, an equal number of samples will be taken
            from each class until `num_samples` have been loaded.
            If None, all samples will be loaded.

        Returns
        -------
        CDataset
            Dataset with one flattened image per row, the renumbered
            labels and a header carrying the extra attributes above.

        Raises
        ------
        ValueError
            If `ds` is not 'training' or 'testing', if a file does not
            start with the expected magic number, or if fewer than
            `num_samples` samples are available.

        """
        if ds == "training":
            data_path = self.train_data_path
            lbl_path = self.train_labels_path
        elif ds == "testing":
            data_path = self.test_data_path
            lbl_path = self.test_labels_path
        else:
            raise ValueError("ds must be 'training' or 'testing'")

        self.logger.info("Loading MNIST {:} dataset from {:}...".format(
            ds, MNIST_PATH))

        # Read the labels. The context manager closes the file even when
        # the magic number check below raises.
        with open(lbl_path, 'rb') as flbl:
            magic_nr, _ = struct.unpack(">II", flbl.read(8))
            if magic_nr != 2049:
                raise ValueError('Magic number mismatch, expected 2049, '
                                 'got {}'.format(magic_nr))
            lbl = array("b", flbl.read())

        # Read the images (header: magic number, count, rows, columns)
        with open(data_path, 'rb') as fimg:
            magic_nr, size, rows, cols = struct.unpack(
                ">IIII", fimg.read(16))
            if magic_nr != 2051:
                raise ValueError('Magic number mismatch, expected 2051, '
                                 'got {}'.format(magic_nr))
            img = array("B", fimg.read())

        # Convert digits to tuple in case was passed as array/list
        digits = tuple(digits)

        # Number of samples per class
        if num_samples is not None:
            div = len(digits)
            # Spread the remainder over the first classes
            n_samples_class = [
                int(num_samples / div) + (1 if x < num_samples % div else 0)
                for x in range(div)
            ]
            n_samples_class = {
                e: n_samples_class[e_i]
                for e_i, e in enumerate(digits)
            }
        else:  # No constraint on the number of samples
            n_samples_class = {e: size for e in digits}

        # Counter of already taken sample for a class
        count_samples_class = {e: 0 for e in digits}

        # Extract the indices of samples to load
        ind = []
        for k in range(size):
            digit = lbl[k]
            if digit in digits:
                # Check the maximum number of samples for current digit
                if count_samples_class[digit] < n_samples_class[digit]:
                    ind.append(k)
                    count_samples_class[digit] += 1

        # Number of loaded samples
        num_loaded = sum(count_samples_class.values())

        # Check if dataset has enough samples
        if num_samples is not None and num_loaded < num_samples:
            min_val = min(count_samples_class.values())
            raise ValueError(
                "not enough samples in dataset for one ore more of the "
                "desired classes ({:} available)".format(min_val))

        img_size = rows * cols
        images = CArray.zeros((len(ind), img_size), dtype=np.uint8)
        labels = CArray.zeros(len(ind), dtype=int)
        # New label of each digit = its position inside `digits`
        label_of_digit = {d: pos for pos, d in enumerate(digits)}
        for i, k in enumerate(ind):
            images[i, :] = CArray(img[k * img_size:(k + 1) * img_size])
            labels[i] = label_of_digit[lbl[k]]

        # Image size as declared by the images file header
        header = CDatasetHeader(img_w=cols, img_h=rows, y_original=digits)

        return CDataset(images, labels, header=header)
    def compute_indices(self, dataset):
        """Split the samples into train and test indices by timestamp.

        Samples whose timestamp is not later than `self.th_timestamp`
        are candidates for the training set, all the others for the
        test set. The computed indices are also stored in
        `self._tr_idx` and `self._ts_idx`.

        Parameters
        ----------
        dataset : CDataset
            Dataset to split. Its header must provide `timestamp`
            and `timestamp_fmt`.

        Returns
        -------
        tr_idx, ts_idx : CArray
            Flat arrays with the tr/ts indices.

        Raises
        ------
        AttributeError
            If the dataset header lacks the timestamp information.
        ValueError
            If one of the two sides of the threshold has no samples,
            or if an integer train/test size is out of range.

        """
        header = dataset.header
        if not (hasattr(header, 'timestamp') and
                hasattr(header, 'timestamp_fmt')):
            raise AttributeError("dataset must contain `timestamp` and "
                                 "'timestamp_fmt' information")

        fmt = header.timestamp_fmt
        timestamps = header.timestamp

        # True for the samples having `timestamp <= th`
        tr_mask = CArray([
            datetime.strptime(stamp, fmt) <= self.th_timestamp
            for stamp in timestamps])
        # Every other sample is a test set candidate
        ts_mask = tr_mask.logical_not()

        # Number of available train/test samples
        max_tr = tr_mask.sum()
        max_ts = dataset.num_samples - max_tr

        if max_tr == 0:
            raise ValueError("no samples with timestamp <= {:}. "
                             "Cannot split dataset.".format(self.th_timestamp))

        if max_ts == 0:
            raise ValueError("no samples with timestamp > {:}. "
                             "Cannot split dataset.".format(self.th_timestamp))

        # Number of desired train samples
        if not is_int(self.train_size):
            # Proportion of the available train samples (at least 1)
            tr_size = int(max(1, round(max_tr * self.train_size)))
        elif not (1 <= self.train_size <= max_tr):
            raise ValueError(
                "train_size should be between 1 and {:}".format(max_tr))
        else:  # Valid integer, use it directly
            tr_size = self.train_size

        # Number of desired test samples
        if not is_int(self.test_size):
            # Proportion of the available test samples (at least 1)
            ts_size = int(max(1, round(max_ts * self.test_size)))
        elif not (1 <= self.test_size <= max_ts):
            raise ValueError(
                "test_size should be between 1 and {:}".format(max_ts))
        else:  # Valid integer, use it directly
            ts_size = self.test_size

        # Turn the boolean masks into arrays of indices
        tr_idx = CArray(tr_mask.find(tr_mask))
        ts_idx = CArray(ts_mask.find(ts_mask))

        # Pick the subset of indices to include in train/test set
        if self.shuffle is not True:
            # No shuffling: take the leading indices
            tr_idx = tr_idx[:tr_size]
            ts_idx = ts_idx[:ts_size]
        else:
            # Shuffling: draw the indices at random
            tr_idx = CArray.randsample(
                tr_idx, shape=(tr_size, ), random_state=self.random_state)
            ts_idx = CArray.randsample(
                ts_idx, shape=(ts_size, ), random_state=self.random_state)

        self._tr_idx = tr_idx
        self._ts_idx = ts_idx

        return self.tr_idx, self.ts_idx