def create_int_list_from_x_adv(
        x_adv: CArray, embedding_value: int, is_shifting_values: bool
) -> bytearray:
    """
    Turn a CArray sample into the sequence of bytes it encodes

    Parameters
    ----------
    x_adv : CArray
        the sample as a CArray
    embedding_value : int
        the value that marks padding inside the sample.
        A value of -1 is interpreted as 256
    is_shifting_values : bool
        True if the stored values are shifted by one: in that case
        1 is subtracted from every entry before the conversion

    Returns
    -------
    bytearray
        the first row of the sample, cast to uint8, as a bytearray
    """
    if embedding_value == -1:
        padding_value = 256
    else:
        padding_value = embedding_value

    padding_hits = x_adv.find(x_adv == padding_value)
    if padding_hits:
        # Keep only what comes before the first padding entry
        first_padding = padding_hits[0]
        x_adv = x_adv[:first_padding]

    if is_shifting_values:
        x_adv = x_adv - 1

    first_row = x_adv[0, :]
    byte_values = first_row.astype(np.uint8).flatten().tolist()
    return bytearray(byte_values)
def create_int_list_from_x_adv(x_adv: CArray, embedding_value: int, is_shifting_values: bool) -> bytearray:
    """
    Convert a CArray sample to the bytes it encodes

    Parameters
    ----------
    x_adv : CArray
        the sample as a CArray
    embedding_value : int
        the value that marks padding inside the sample.
        A value of -1 is interpreted as 256
    is_shifting_values : bool
        True if the values are shifted by one: in that case 1 is
        subtracted from every entry before the conversion

    Returns
    -------
    bytearray
        the first row of the sample, cast to uint8, as a bytearray
    """
    invalid_value = 256 if embedding_value == -1 else embedding_value
    padding_positions = x_adv.find(x_adv == invalid_value)
    if padding_positions:
        # Drop everything from the first padding entry onwards
        x_adv = x_adv[:padding_positions[0]]
    if is_shifting_values:
        x_adv = x_adv - 1
    # Flatten before converting to a list. Taking `tolist()[0]` is only
    # correct when tolist() yields a nested list; on a flat list it would
    # pick a single integer, and bytearray(int) silently builds a
    # zero-filled buffer of that length instead of the sample bytes.
    x_adv_edit = x_adv[0, :].astype(np.uint8).flatten().tolist()
    return bytearray(x_adv_edit)
def create_real_sample_from_adv(self, original_file_path: str, x_adv: CArray, new_file_path: str = None):
    """
    Overwrite the head of the original file with the adversarial bytes

    Parameters
    ----------
    original_file_path : str
        path of the file whose content is used as starting point
    x_adv : CArray
        the perturbed sample. Entries equal to the embedding value of
        the classifier are treated as padding
    new_file_path : str, optional, default None
        where to write the result. Nothing is written when left empty

    Returns
    -------
    bytearray
        the content of the original file, with its leading bytes
        replaced by the ones of the sample
    """
    with open(original_file_path, 'rb') as original_file:
        code = bytearray(original_file.read())

    embedding_value = self.classifier.get_embedding_value()
    padding_index = x_adv.find(x_adv == embedding_value)

    sample = copy.copy(x_adv)
    if padding_index:
        # Keep only the part of the first row that precedes the padding
        first_padding = padding_index[0]
        sample = sample[0, :first_padding]
    if self.shift_values:
        sample = sample - 1

    byte_values = sample.astype(np.uint8).flatten().tolist()
    payload = bytes(byte_values)
    code[:len(payload)] = payload

    if new_file_path:
        with open(new_file_path, 'wb') as output_file:
            output_file.write(code)
    return code
def predict(self, x: CArray, return_decision_function: bool = True):
    """
    Classify a sample given in input space.

    If the sample contains the value 256, only the part of its first
    row that precedes the first occurrence is kept. Features are then
    extracted from the sample and passed to the wrapped classifier.

    Parameters
    ----------
    x : CArray
        The input sample in input space.
    return_decision_function : bool, default True
        Forwarded as-is to the `predict` method of the wrapped
        classifier.

    Returns
    -------
    The output of the `predict` method of the wrapped classifier.

    """
    padding_hits = x.find(x == 256)
    if padding_hits:
        first_padding = padding_hits[0]
        x = x[0, :first_padding]

    features = self.extract_features(x)
    prediction = self.classifier.predict(
        features, return_decision_function=return_decision_function)
    return prediction
def create_real_sample_from_adv(self, original_file_path: str, x_adv: CArray, new_file_path: str = None) -> bytearray:
    """
    Build the real adversarial file from a perturbed sample

    The original file is read, and its first bytes are replaced by the
    (unpadded) content of the perturbed sample.

    Parameters
    ----------
    original_file_path : str
        path of the original malware sample
    x_adv : CArray
        the perturbed malware sample, as created by the optimizer
    new_file_path : str, optional, default None
        path where the adversarial malware is saved.
        Leave None to not save the result to disk

    Returns
    -------
    bytearray
        the adversarial malware, as a sequence of bytes
    """
    with open(original_file_path, 'rb') as source:
        code = bytearray(source.read())

    padding_index = x_adv.find(
        x_adv == self.classifier.get_embedding_value())

    trimmed = copy.copy(x_adv)
    if padding_index:
        # Discard the padding that follows the actual content
        trimmed = trimmed[0, :padding_index[0]]
    if self.shift_values:
        trimmed = trimmed - 1

    replacement = bytes(trimmed.astype(np.uint8).flatten().tolist())
    code[:len(replacement)] = replacement

    if new_file_path:
        with open(new_file_path, 'wb') as destination:
            destination.write(code)
    return code
def init_starting_point(self, x: CArray):
    """
    Store the starting sample and reset the stored results.

    The sample is saved in `_original_x`. If it contains the value 256,
    only the part of its first row that precedes the first occurrence
    is kept.

    Parameters
    ----------
    x : CArray
        The sample to start from.

    """
    self._original_x = x
    padding_hits = x.find(x == 256)
    # The reset is deliberately done before the sample is truncated,
    # to keep the same order of operations on the instance state
    self.clear_results()
    if not padding_hits:
        return
    first_padding = padding_hits[0]
    self._original_x = self._original_x[0, :first_padding]
def load(self, ds, digits=tuple(range(0, 10)), num_samples=None):
    """Load all images of specified format inside given path.

    Adapted from: http://cvxopt.org/_downloads/mnist.py

    Extra dataset attributes:
     - 'img_w', 'img_h': size of the images in pixels, as declared
        in the header of the images file.
     - 'y_original': array with the original labels (before renumbering)

    Parameters
    ----------
    ds : str
        Identifier of the dataset to load, either 'training' or 'testing'.
    digits : tuple
        Tuple with the digits to load. By default all digits are loaded.
    num_samples : int or None, optional
        Number of expected samples in resulting ds.
        If int, an equal number of samples will be taken
        from each class until `num_samples` have been loaded
        (the first `num_samples % len(digits)` digits get one extra
        sample when the division is not exact).
        If None, all samples will be loaded.

    Returns
    -------
    CDataset
        Dataset with one flattened image per row. Each label is the
        position of the original digit inside `digits`.

    Raises
    ------
    ValueError
        If `ds` is not 'training' or 'testing', if one of the files does
        not start with the expected magic number, or if less than
        `num_samples` samples could be loaded.

    """
    if ds == "training":
        data_path = self.train_data_path
        lbl_path = self.train_labels_path
    elif ds == "testing":
        data_path = self.test_data_path
        lbl_path = self.test_labels_path
    else:
        raise ValueError("ds must be 'training' or 'testing'")

    self.logger.info("Loading MNIST {:} dataset from {:}...".format(
        ds, MNIST_PATH))

    # Opening the labels data
    # (context manager: the file is closed also if the check below raises)
    with open(lbl_path, 'rb') as flbl:
        magic_nr, size = struct.unpack(">II", flbl.read(8))
        if magic_nr != 2049:
            raise ValueError('Magic number mismatch, expected 2049, '
                             'got {}'.format(magic_nr))
        lbl = array("b", flbl.read())

    # Opening the images data
    with open(data_path, 'rb') as fimg:
        magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
        if magic_nr != 2051:
            raise ValueError('Magic number mismatch, expected 2051, '
                             'got {}'.format(magic_nr))
        img = array("B", fimg.read())

    # Convert digits to tuple in case was passed as array/list
    digits = tuple(digits)

    # Number of samples per class
    if num_samples is not None:
        div = len(digits)
        n_samples_class = [
            int(num_samples / div) + (1 if x < num_samples % div else 0)
            for x in range(div)
        ]
        n_samples_class = {
            e: n_samples_class[e_i] for e_i, e in enumerate(digits)
        }
    else:  # No constraint on the number of samples
        n_samples_class = {e: size for e in digits}

    # Counter of already taken sample for a class
    count_samples_class = {e: 0 for e in digits}

    # Extract the indices of samples to load
    ind = []
    for k in range(size):
        label = lbl[k]
        # Take the sample only if its digit is requested and the maximum
        # number of samples for that digit has not been reached yet
        if label in digits and \
                count_samples_class[label] < n_samples_class[label]:
            ind.append(k)
            count_samples_class[label] += 1

    # Number of loaded samples
    num_loaded = sum(count_samples_class.values())

    # Check if dataset has enough samples
    if num_samples is not None and num_loaded < num_samples:
        min_val = min(count_samples_class.values())
        raise ValueError(
            "not enough samples in dataset for one ore more of the "
            "desired classes ({:} available)".format(min_val))

    img_size = rows * cols  # Number of pixels of a single image
    images = CArray.zeros((len(ind), img_size), dtype=np.uint8)
    labels = CArray.zeros(len(ind), dtype=int)
    digs_array = CArray(digits)  # To use find method

    for i, k in enumerate(ind):
        images[i, :] = CArray(img[k * img_size:(k + 1) * img_size])
        # The new label is the position of the digit inside `digits`
        labels[i] = CArray(digs_array.find(digs_array == lbl[k]))

    # Use the image size read from the file, so that the header always
    # agrees with the number of columns of `images`
    header = CDatasetHeader(img_w=cols, img_h=rows, y_original=digits)

    return CDataset(images, labels, header=header)
def compute_indices(self, dataset):
    """Compute training set and test set indices.

    Samples having `timestamp <= th_timestamp` are candidates for the
    training set, all the other samples are candidates for the test set.
    An integer `train_size`/`test_size` is used as the number of samples
    to take. Any other value is multiplied by the number of candidates
    and rounded (at least 1 sample is always taken).

    Parameters
    ----------
    dataset : CDataset
        Dataset to split. Its header must provide the `timestamp`
        and `timestamp_fmt` attributes.

    Returns
    -------
    tr_idx, ts_idx : CArray
        Flat arrays with the tr/ts indices.

    Raises
    ------
    AttributeError
        If the header of the dataset has no `timestamp` or
        `timestamp_fmt` attribute.
    ValueError
        If no sample falls in the training (or test) side of the
        threshold, or if an integer size is outside the valid range.

    """
    header = dataset.header
    if not (hasattr(header, 'timestamp') and
            hasattr(header, 'timestamp_fmt')):
        raise AttributeError("dataset must contain `timestamp` and "
                             "'timestamp_fmt' information")

    timestamps = header.timestamp
    fmt = header.timestamp_fmt

    # True for the samples not newer than the threshold (training side)
    tr_mask = CArray([
        datetime.strptime(stamp, fmt) <= self.th_timestamp
        for stamp in timestamps
    ])
    # Everything else belongs to the test side
    ts_mask = tr_mask.logical_not()

    # Number of available train/test samples
    max_tr = tr_mask.sum()
    max_ts = dataset.num_samples - max_tr

    if max_tr == 0:
        raise ValueError("no samples with timestamp <= {:}. "
                         "Cannot split dataset.".format(self.th_timestamp))
    if max_ts == 0:
        raise ValueError("no samples with timestamp > {:}. "
                         "Cannot split dataset.".format(self.th_timestamp))

    # Number of desired train samples
    if is_int(self.train_size):
        if self.train_size < 1 or self.train_size > max_tr:
            raise ValueError(
                "train_size should be between 1 and {:}".format(max_tr))
        # A valid integer is used directly
        tr_size = self.train_size
    else:
        # Proportion of the available train samples (at least 1)
        tr_size = int(max(1, round(max_tr * self.train_size)))

    # Number of desired test samples
    if is_int(self.test_size):
        if self.test_size < 1 or self.test_size > max_ts:
            raise ValueError(
                "test_size should be between 1 and {:}".format(max_ts))
        # A valid integer is used directly
        ts_size = self.test_size
    else:
        # Proportion of the available test samples (at least 1)
        ts_size = int(max(1, round(max_ts * self.test_size)))

    # Turn the boolean masks into arrays of indices
    tr_candidates = CArray(tr_mask.find(tr_mask))
    ts_candidates = CArray(ts_mask.find(ts_mask))

    if self.shuffle is True:
        # Randomly pick the desired number of indices
        tr_idx = CArray.randsample(
            tr_candidates, shape=(tr_size, ),
            random_state=self.random_state)
        ts_idx = CArray.randsample(
            ts_candidates, shape=(ts_size, ),
            random_state=self.random_state)
    else:
        # Take the first indices, in their original order
        tr_idx = tr_candidates[:tr_size]
        ts_idx = ts_candidates[:ts_size]

    self._tr_idx = tr_idx
    self._ts_idx = ts_idx

    return self.tr_idx, self.ts_idx