def _learn_model(scenario_name):
    '''
    Learns a classifier model for the specified scenario if one does not
    already exist.
    '''
    scenario = _scenarios[scenario_name]
    if path.exists(scenario['model']):
        return
    print 'Training the model for scenario {}...'.format(scenario_name)
    # Decide on classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        sys.stdout.write('TRAINING RANDOM FOREST\n')
        cutoff = [c * 0.1 for c in range(1, 10)]
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC(kernel='rbf', C=10, gamma=0.01)
        sys.stdout.write('TRAINING SVM\n')
        cutoff = [0.0]
    # Load the required dataset and train the model
    X, y, _ = datasets.csv2numpy(scenario['training'])
    classifier.fit(X, y)
    # Evaluate the model on the training dataset
    y_pred = classifier.decision_function(X)
    sys.stdout.write('Performance on training data:\n')
    utility.print_stats_cutoff(y, y_pred, cutoff)
    # Save the model in the corresponding file
    classifier.save_model(scenario['model'])
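# Usage sketch (assumption, not part of the original module): train a model for
# every configured scenario by looping over _scenarios. The helper name
# _learn_all_models is hypothetical; it relies only on _scenarios and
# _learn_model, which are defined in this module, and assumes _scenarios has
# already been populated (e.g. by an _initialize() call as in the attack
# functions below).
def _learn_all_models():
    for scenario_name in sorted(_scenarios):
        _learn_model(scenario_name)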
def test_load_model(self):
    # Train an SVM, predict on novel data, then verify that a model saved to
    # disk and reloaded reproduces exactly the same predictions.
    X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
    self.svc.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
    prediction = self.svc.predict(newX)
    self.svc.save_model(config.get('sklearn_SVC_test', 'modelfile'))
    newsvc = sklearn_SVC()
    newsvc.load_model(config.get('sklearn_SVC_test', 'modelfile'))
    self.assertTrue(numpy.array_equal(prediction, newsvc.predict(newX)))
    os.remove(config.get('sklearn_SVC_test', 'modelfile'))
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario and saves the resulting
    attack files in the location specified by the configuration file. If plot
    evaluates to True, saves the resulting plot into the specified file,
    otherwise shows the plot in a window.
    '''
    print 'Running the mimicry attack...'
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load benign files
    print 'Loading attack targets from file "{}"'.format(scenario['targets'])
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    targets = zip(target_paths, target_vectors)
    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print 'Loading attack samples from file "{}"'.format(wolves)
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)
    # Set up classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print 'ATTACKING RANDOM FOREST'
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print 'ATTACKING SVM'
    print 'Loading model from "{}"'.format(scenario['model'])
    classifier.load_model(scenario['model'])
    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        scaler = pickle.load(open(config.get('datasets', 'contagio_scaler')))
        print 'Using scaler'
    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]
    if plot:
        pyplot.figure(1)
    print 'Running the attack...'
    for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
        if isinstance(res, Exception):
            print res
            continue
        (target_path, mimic_path, mimic_score, wolf_score) = res
        print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score)
        print ' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score)
        if path.dirname(mimic_path) != output_dir:
            print ' Moving best to {}\n'.format(
                path.join(output_dir, path.basename(wolf_path)))
            shutil.move(mimic_path,
                        path.join(output_dir, path.basename(wolf_path)))
        if plot:
            pyplot.plot([wolf_score, mimic_score])
    print 'Saved resulting attack files to {}'.format(output_dir)
    if plot:
        pyplot.title('Mimicry attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print 'Saved plot to file {}'.format(plot)
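# Sketch (assumption) of the worker contract expected by pool.imap above:
# _mimicry_wrap receives one pargs tuple and must either return a
# (target_path, mimic_path, mimic_score, wolf_score) tuple or return the raised
# exception as a value, so one failing sample does not abort the whole pool.
# The name run_mimicry and its signature are hypothetical stand-ins for the
# project's actual attack routine, which is defined elsewhere.
def _mimicry_wrap_sketch(pargs):
    wolf_path, targets, classifier, scaler = pargs
    try:
        # Hypothetical call: delegate to the real mimicry attack implementation.
        return run_mimicry(wolf_path, targets, classifier, scaler)
    except Exception as e:
        # Returned, not raised, so the caller can detect it with isinstance().
        return e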
def attack_gdkde(scenario_name, plot=False):
    '''
    Invokes the GD-KDE attack for the given scenario and saves the resulting
    attack files in the location specified by the configuration file. If plot
    evaluates to True, saves the resulting plot into the specified file,
    otherwise shows the plot in a window.
    '''
    print 'Running the GD-KDE attack...'
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_gdkde'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load and print malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print 'Loading attack samples from "{}"'.format(wolves)
    malicious = utility.get_pdfs(wolves)
    if not malicious:
        _attack_files_missing(wolves)
    # Load an SVM trained with scaled data
    scaler = pickle.load(open(config.get('datasets', 'contagio_scaler')))
    print 'Using scaler'
    svm = sklearn_SVC()
    print 'Loading model from "{}"'.format(scenario['model'])
    svm.load_model(scenario['model'])
    # Load the training data used for kernel density estimation
    print 'Loading dataset from file "{}"'.format(scenario['training'])
    X_train, y_train, _ = datasets.csv2numpy(scenario['training'])
    # Subsample for faster execution
    ind_sample = random.sample(range(len(y_train)), 500)
    X_train = X_train[ind_sample, :]
    y_train = y_train[ind_sample]
    # Set parameters
    kde_reg = 10
    kde_width = 50
    step = 1
    max_iter = 50
    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(svm, fname, scaler, X_train, y_train, kde_reg, kde_width,
              step, max_iter, False) for fname in malicious]
    if plot:
        pyplot.figure(1)
    print 'Running the attack...'
    for res, oldf in zip(pool.imap(_gdkde_wrapper, pargs), malicious):
        if isinstance(res, Exception):
            print res
            continue
        (_, fseq, _, _, attack_file) = res
        print 'Processing file "{}":'.format(oldf)
        print ' scores: {}'.format(', '.join([str(s) for s in fseq]))
        print 'Result: "{}"'.format(attack_file)
        if path.dirname(attack_file) != output_dir:
            shutil.move(attack_file, output_dir)
        if plot:
            pyplot.plot(fseq, label=oldf)
    print 'Saved resulting attack files to {}'.format(output_dir)
    if plot:
        pyplot.title('GD-KDE attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_xlim(0, max_iter + 1)
        axes.set_ylabel('SVM score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print 'Saved plot to file {}'.format(plot)
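# Driver sketch (assumption): a minimal way to run either attack for a chosen
# scenario. The helper name _run_attack and the 'mimicry'/'gdkde' choices are
# hypothetical; the real project may expose a different entry point. It only
# calls attack_mimicry and attack_gdkde, which are defined in this module.
def _run_attack(attack, scenario_name, plot=False):
    if attack == 'mimicry':
        attack_mimicry(scenario_name, plot)
    elif attack == 'gdkde':
        attack_gdkde(scenario_name, plot)
    else:
        raise ValueError('Unknown attack: {}'.format(attack))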
# Variant of attack_mimicry in which the best mimic is moved into output_dir
# under its own basename rather than renamed after the original attack file.
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario and saves the resulting
    attack files in the location specified by the configuration file. If plot
    evaluates to True, saves the resulting plot into the specified file,
    otherwise shows the plot in a window.
    '''
    print 'Running the mimicry attack...'
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load benign files
    print 'Loading attack targets from file "{}"'.format(scenario['targets'])
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    targets = zip(target_paths, target_vectors)
    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print 'Loading attack samples from file "{}"'.format(wolves)
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)
    # Set up classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print 'ATTACKING RANDOM FOREST'
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print 'ATTACKING SVM'
    print 'Loading model from "{}"'.format(scenario['model'])
    classifier.load_model(scenario['model'])
    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        scaler = pickle.load(open(config.get('datasets', 'contagio_scaler')))
        print 'Using scaler'
    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]
    if plot:
        pyplot.figure(1)
    print 'Running the attack...'
    for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
        if isinstance(res, Exception):
            print res
            continue
        (target_path, mimic_path, mimic_score, wolf_score) = res
        print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score)
        print ' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score)
        if path.dirname(mimic_path) != output_dir:
            print ' Moving best to {}\n'.format(
                path.join(output_dir, path.basename(mimic_path)))
            shutil.move(mimic_path, output_dir)
        if plot:
            pyplot.plot([wolf_score, mimic_score])
    print 'Saved resulting attack files to {}'.format(output_dir)
    if plot:
        pyplot.title('Mimicry attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print 'Saved plot to file {}'.format(plot)
def test_constructor(self):
    # Constructing several sklearn_SVC instances should not raise.
    _ = sklearn_SVC()
    _ = sklearn_SVC()
    _ = sklearn_SVC()
def setUp(self):
    # Fresh classifier instance for every test case.
    self.svc = sklearn_SVC()
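# Test-runner sketch (assumption): setUp, test_constructor and test_load_model
# above belong to a unittest.TestCase subclass; a module defining that class
# would typically be executed like this.
if __name__ == '__main__':
    import unittest
    unittest.main()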