def attack_mimicry(scenario_name, plot=False): ''' Invokes the mimcry attack for the given scenario and saves the resulting attack files in the location specified by the configuration file. If plot evaluates to True, saves the resulting plot into the specified file, otherwise shows the plot in a window. ''' print 'Running the mimicry attack...' _initialize() scenario = _scenarios[scenario_name] output_dir = config.get('results', '{}_mimicry'.format(scenario_name)) # Make results reproducible random.seed(0) # Load benign files print 'Loading attack targets from file "{}"'.format(scenario['targets']) target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets']) targets = zip(target_paths, target_vectors) # Load malicious files wolves = config.get('experiments', 'contagio_attack_pdfs') if not path.exists(wolves): _attack_files_missing(wolves) print 'Loading attack samples from file "{}"'.format(wolves) malicious = sorted(utility.get_pdfs(wolves)) if not malicious: _attack_files_missing(wolves) # Set up classifier classifier = 0 if scenario['classifier'] == 'rf': classifier = RandomForest() print 'ATTACKING RANDOM FOREST' elif scenario['classifier'] == 'svm': classifier = sklearn_SVC() print 'ATTACKING SVM' print 'Loading model from "{}"'.format(scenario['model']) classifier.load_model(scenario['model']) # Standardize data points if necessary scaler = None if 'scaled' in scenario['model']: scaler = pickle.load(open(config.get('datasets', 'contagio_scaler'))) print 'Using scaler' # Set up multiprocessing pool = multiprocessing.Pool() pargs = [(mal, targets, classifier, scaler) for mal in malicious] if plot: pyplot.figure(1) print 'Running the attack...' for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)): if isinstance(res, Exception): print res continue (target_path, mimic_path, mimic_score, wolf_score) = res print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score) print ' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score) if path.dirname(mimic_path) != output_dir: print ' Moving best to {}\n'.format( path.join(output_dir, path.basename(wolf_path))) shutil.move(mimic_path, path.join(output_dir, path.basename(wolf_path))) if plot: pyplot.plot([wolf_score, mimic_score]) print 'Saved resulting attack files to {}'.format(output_dir) if plot: pyplot.title('Mimicry attack') axes = pyplot.axes() axes.set_xlabel('Iterations') axes.set_ylabel('Classifier score') axes.yaxis.grid() fig = pyplot.gcf() fig.set_size_inches(6, 4.5) fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96) if plot == 'show': pyplot.show() else: pyplot.savefig(plot, dpi=300) print 'Saved plot to file {}'.format(plot)
def attack_mimicry(scenario_name, plot=False): ''' Invokes the mimcry attack for the given scenario and saves the resulting attack files in the location specified by the configuration file. If plot evaluates to True, saves the resulting plot into the specified file, otherwise shows the plot in a window. ''' print 'Running the mimicry attack...' _initialize() scenario = _scenarios[scenario_name] output_dir = config.get('results', '{}_mimicry'.format(scenario_name)) # Make results reproducible random.seed(0) # Load benign files print 'Loading attack targets from file "{}"'.format(scenario['targets']) target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets']) targets = zip(target_paths, target_vectors) # Load malicious files wolves = config.get('experiments', 'contagio_attack_pdfs') if not path.exists(wolves): _attack_files_missing(wolves) print 'Loading attack samples from file "{}"'.format(wolves) malicious = sorted(utility.get_pdfs(wolves)) if not malicious: _attack_files_missing(wolves) # Set up classifier classifier = 0 if scenario['classifier'] == 'rf': classifier = RandomForest() print 'ATTACKING RANDOM FOREST' elif scenario['classifier'] == 'svm': classifier = sklearn_SVC() print 'ATTACKING SVM' print 'Loading model from "{}"'.format(scenario['model']) classifier.load_model(scenario['model']) # Standardize data points if necessary scaler = None if 'scaled' in scenario['model']: scaler = pickle.load(open(config.get('datasets', 'contagio_scaler'))) print 'Using scaler' # Set up multiprocessing pool = multiprocessing.Pool() pargs = [(mal, targets, classifier, scaler) for mal in malicious] if plot: pyplot.figure(1) print 'Running the attack...' for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)): if isinstance(res, Exception): print res continue (target_path, mimic_path, mimic_score, wolf_score) = res print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score) print ' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score) if path.dirname(mimic_path) != output_dir: print ' Moving best to {}\n'.format(path.join(output_dir, path.basename(mimic_path))) shutil.move(mimic_path, output_dir) if plot: pyplot.plot([wolf_score, mimic_score]) print 'Saved resulting attack files to {}'.format(output_dir) if plot: pyplot.title('Mimicry attack') axes = pyplot.axes() axes.set_xlabel('Iterations') axes.set_ylabel('Classifier score') axes.yaxis.grid() fig = pyplot.gcf() fig.set_size_inches(6, 4.5) fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96) if plot == 'show': pyplot.show() else: pyplot.savefig(plot, dpi=300) print 'Saved plot to file {}'.format(plot)
def fig9(tr_vec, tr_labels, te_vec, te_labels, fnames): ''' Reproduction of results published in Table 10 of "Malicious PDF Detection Using Metadata and Structural Features" by Charles Smutz and Angelos Stavrou, ACSAC 2012. ''' print 'Loading random forest classifier...' rf = RandomForest() rf.load_model(config.get('experiments', 'FTC_model')) ben_means, ben_devs = common.get_benign_mean_stddev(tr_vec, tr_labels) res = [] # te_vec will be randomly modified in feature space. # f_vec will be randomly modified in feature space but the # randomly generated variables will be adjusted to be # valid for the given feature f_vec = te_vec.copy() print 'Got {} samples. Modifying them for attack...'.format(len(te_vec)) print '{:>25s} {:>15s} {:>15s}'.format('Feature name', 'Feature space', 'Problem space') pool = multiprocessing.Pool(processes=None) # Modify top features one by one for f_name in common.top_feats: f_i = FeatureDescriptor.get_feature_names().index(f_name) f_desc = FeatureDescriptor.get_feature_description(f_name) print '{:>25s}'.format(f_name), # For all files for i in range(len(te_vec)): if te_labels[i] != 1: # Modify only malicious files continue first_val = True while True: # Keep randomly generating a new value # Stop when it becomes valid for the current feature new_val = random.gauss(ben_means[f_i], ben_devs[f_i]) if first_val: # Make sure we generate random values for te_vec te_vec[i][f_i] = new_val first_val = False # If not valid, retry if f_desc['type'] == bool: new_val = False if new_val < 0.5 else True elif f_desc['type'] == int: new_val = int(round(new_val)) if f_desc['range'][0] == FileDefined and new_val < 0: continue elif (f_desc['range'][0] != FileDefined and new_val < f_desc['range'][0]): continue if f_desc['type'] != bool and f_desc['range'][1] < new_val: continue # Valid, win! f_vec[i][f_i] = new_val break # mod_data has feature values read from the problem space, # i.e., by converting feature vectors to files and back mod_data = f_vec.copy() pargs = [(fnames[i], f_vec[i], i) for i, l in enumerate(te_labels) if l == 1] for mimic, m_id in pool.imap(mimicry_wrap, pargs): mod_data[m_id] = mimic pred = rf.predict(te_vec) fspace = accuracy_score(te_labels, pred) print '{:>15.3f}'.format(fspace), pred = rf.predict(mod_data) pspace = accuracy_score(te_labels, pred) print '{:>15.3f}'.format(pspace) res.append((fspace, pspace)) return res