def main():
    '''
    Command-line entry point: polls PDFrate for every input PDF and prints
    one status line per file to standard output.

    Files whose status is 'unknown' or 'waiting' are skipped unless the
    matching --with-unknown / --with-waiting flag is given. For the
    'success' and 'nometadata' statuses the 'contagio_bm' value of the
    entry with the largest key in the report's results is appended
    (presumably the most recent result - confirm against
    PdfrateQueryHandler.poll).

    Always returns 0.
    '''
    # Command-line interface
    cli = ArgumentParser()
    cli.add_argument('pdfs',
                     help='Input PDFs (directory or file with path list)')
    cli.add_argument('--with-unknown', action='store_true', default=False,
                     help='Display queries for unknown files')
    cli.add_argument('--with-waiting', action='store_true', default=False,
                     help='Display pending queries')
    options = cli.parse_args()

    pdf_paths = sorted(utility.get_pdfs(options.pdfs))

    # Statuses that are filtered out of the listing
    hidden_statuses = []
    if not options.with_unknown:
        hidden_statuses.append('unknown')
    if not options.with_waiting:
        hidden_statuses.append('waiting')

    query_handler = PdfrateQueryHandler()
    for pdf_path in pdf_paths:
        report = query_handler.poll(pdf_path)
        if report['status'] in hidden_statuses:
            continue
        sys.stdout.write('{}: {}'.format(report['filename'],
                                         report['status']))
        if report['status'] in ('success', 'nometadata'):
            # Pick the entry with the largest key
            newest = max(report['results'].keys())
            score = report['results'][newest]['contagio_bm']
            sys.stdout.write(' [{}%]'.format(score))
        sys.stdout.write('\n')
    return 0
def main():
    '''
    Command-line entry point: collects the malicious and benign PDF paths
    named by the optional --mal / --ben arguments and hands them, together
    with the output CSV path, to extract_features().

    An omitted --mal or --ben argument results in an empty list for that
    class. Always returns 0.
    '''
    # Command-line interface
    cli = ArgumentParser()
    cli.add_argument(
        '--mal',
        help='Malicious PDFs (directory or file with list of paths)')
    cli.add_argument(
        '--ben',
        help='Benign PDFs (directory or file with list of paths)')
    cli.add_argument('csv', help='Resulting CSV file')
    options = cli.parse_args()

    # Same evaluation order as before: malicious first, then benign
    malicious = sorted(utility.get_pdfs(options.mal)) if options.mal else []
    benign = sorted(utility.get_pdfs(options.ben)) if options.ben else []

    extract_features(benign, malicious, options.csv)
    return 0
def get_FTC_mimicry():
    '''
    Returns the feature vectors and labels of all mimicry attack results
    in the FTC scenario.

    If no attack files are found at the configured 'FTC_mimicry' results
    location, the mimicry attack is run first to generate them.

    Returns a tuple (features, labels):
      features - numpy.array of size (number of samples, number of
                 features) with one row of feature values per attack file
      labels   - list with one 1.0 per sample
    '''
    results_location = config.get('results', 'FTC_mimicry')
    pdfs = utility.get_pdfs(results_location)
    if not pdfs:
        # Generate the attack files
        attack_mimicry('FTC')
        pdfs = utility.get_pdfs(results_location)

    print('Loading feature vectors from mimicry attack results...')
    results = numpy.zeros((len(pdfs),
                           FeatureDescriptor.get_feature_count()))
    for row, pdf in enumerate(pdfs):
        results[row, :] = FeatureEdit(pdf).retrieve_feature_vector_numpy()
    return results, [1.0] * len(pdfs)
def get_FTC_mimicry():
    '''
    Returns the feature vectors and labels of all mimicry attack results
    in the FTC scenario.

    When the configured 'FTC_mimicry' results location holds no attack
    files, the mimicry attack is invoked to create them before loading.

    Returns a tuple (features, labels):
      features - numpy.array of size (number of samples, number of
                 features), one row per attack file
      labels   - list containing 1.0 once for every sample
    '''
    location = config.get('results', 'FTC_mimicry')
    pdfs = utility.get_pdfs(location)
    if not pdfs:
        # Generate the attack files
        attack_mimicry('FTC')
        pdfs = utility.get_pdfs(location)

    print('Loading feature vectors from mimicry attack results...')
    sample_count = len(pdfs)
    results = numpy.zeros((sample_count,
                           FeatureDescriptor.get_feature_count()))
    for index, pdf in enumerate(pdfs):
        results[index, :] = FeatureEdit(pdf).retrieve_feature_vector_numpy()
    return results, [1.0] * sample_count
def main():
    '''
    Command-line entry point for feature extraction.

    Reads the optional --mal and --ben PDF sources plus the mandatory
    output CSV path, resolves each given source into a sorted list of PDF
    paths (an omitted source yields an empty list) and passes both lists
    and the CSV path to extract_features(). Always returns 0.
    '''
    # Describe the command line
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        '--mal',
        help='Malicious PDFs (directory or file with list of paths)')
    arg_parser.add_argument(
        '--ben',
        help='Benign PDFs (directory or file with list of paths)')
    arg_parser.add_argument('csv', help='Resulting CSV file')
    parsed = arg_parser.parse_args()

    # Resolve the sources; malicious is looked up before benign
    mal_paths = []
    if parsed.mal:
        mal_paths = sorted(utility.get_pdfs(parsed.mal))
    ben_paths = []
    if parsed.ben:
        ben_paths = sorted(utility.get_pdfs(parsed.ben))

    extract_features(ben_paths, mal_paths, parsed.csv)
    return 0
def main():
    '''
    Command-line entry point: submits every input PDF to PDFrate.

    The input PDFs are processed in sorted order. A header is written to
    standard output, and after each submission a progress line
    ("File i/n: path") is written to standard error. The --priority
    option (integer, default 0) is forwarded to each submission.

    Always returns 0.
    '''
    # Command-line interface
    cli = ArgumentParser()
    cli.add_argument('pdfs',
                     help='Input PDFs (directory or file with path list)')
    cli.add_argument('--priority', type=int, default=0,
                     help='Submission priority')
    options = cli.parse_args()

    queue = sorted(utility.get_pdfs(options.pdfs))
    total = len(queue)
    rule = '-' * 10

    sys.stdout.write("Submitting files to PDFrate:\n\n")
    query_handler = PdfrateQueryHandler()
    for position, pdf_path in enumerate(queue, 1):
        query_handler.submit_query(pdf_path, get_metadata=True,
                                   priority=options.priority)
        sys.stderr.write('{d} File {i}/{n}: {f} {d}\n'.format(
            d=rule, i=position, n=total, f=pdf_path))
    return 0
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario and saves the
    resulting attack files in the location specified by the configuration
    file.

    scenario_name - key of the scenario in the module's scenario table
    plot          - if it evaluates to False, no plot is produced; if it
                    is the string 'show', the plot is shown in a window;
                    any other true value is used as the name of the file
                    the plot is saved into.

    Raises ValueError if the scenario names a classifier other than 'rf'
    or 'svm'.
    '''
    print('Running the mimicry attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)

    # Load benign files
    print('Loading attack targets from file "{}"'.format(scenario['targets']))
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    # Materialized so the same pairs can be shipped with every task
    targets = list(zip(target_paths, target_vectors))

    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from file "{}"'.format(wolves))
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Set up classifier
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print('ATTACKING RANDOM FOREST')
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print('ATTACKING SVM')
    else:
        # Fail with a clear message instead of an AttributeError later on
        raise ValueError('Unknown classifier "{}" in scenario "{}"'.format(
            scenario['classifier'], scenario_name))
    print('Loading model from "{}"'.format(scenario['model']))
    classifier.load_model(scenario['model'])

    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        # NOTE(review): file mode is unchanged (default); the pickle comes
        # from a locally configured path and is trusted as before.
        with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
            scaler = pickle.load(scaler_file)
        print('Using scaler')

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for wolf_path, res in zip(malicious,
                                  pool.imap(_mimicry_wrap, pargs)):
            # A worker reports a failure by returning the exception
            if isinstance(res, Exception):
                print(res)
                continue
            (target_path, mimic_path, mimic_score, wolf_score) = res
            print('Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score))
            print(' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score))
            if path.dirname(mimic_path) != output_dir:
                # The attack file is stored under the original file's name
                destination = path.join(output_dir, path.basename(wolf_path))
                print(' Moving best to {}\n'.format(destination))
                shutil.move(mimic_path, destination)
            if plot:
                pyplot.plot([wolf_score, mimic_score])
    finally:
        # Do not leave worker processes behind, even on error
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('Mimicry attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
def attack_gdkde(scenario_name, plot=False):
    '''
    Invokes the GD-KDE attack for the given scenario and saves the
    resulting attack files in the location specified by the configuration
    file.

    scenario_name - key of the scenario in the module's scenario table
    plot          - if it evaluates to False, no plot is produced; if it
                    is the string 'show', the plot is shown in a window;
                    any other true value is used as the name of the file
                    the plot is saved into.
    '''
    print('Running the GD-KDE attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_gdkde'.format(scenario_name))
    # Make results reproducible
    random.seed(0)

    # Load and print malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from "{}"'.format(wolves))
    malicious = utility.get_pdfs(wolves)
    if not malicious:
        _attack_files_missing(wolves)

    # Load an SVM trained with scaled data
    # NOTE(review): file mode is unchanged (default); the pickle comes from
    # a locally configured path and is trusted as before.
    with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
        scaler = pickle.load(scaler_file)
    print('Using scaler')
    svm = sklearn_SVC()
    print('Loading model from "{}"'.format(scenario['model']))
    svm.load_model(scenario['model'])

    # Load the training data used for kernel density estimation
    print('Loading dataset from file "{}"'.format(scenario['training']))
    X_train, y_train, _ = datasets.csv2numpy(scenario['training'])
    # Subsample for faster execution; capped at the dataset size because
    # random.sample() raises ValueError when asked for more items than
    # the population holds
    sample_size = min(500, len(y_train))
    ind_sample = random.sample(range(len(y_train)), sample_size)
    X_train = X_train[ind_sample, :]
    y_train = y_train[ind_sample]

    # Set parameters
    kde_reg = 10
    kde_width = 50
    step = 1
    max_iter = 50

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    # The trailing False is passed through to the worker unchanged
    pargs = [(svm, fname, scaler, X_train, y_train, kde_reg, kde_width,
              step, max_iter, False) for fname in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for res, oldf in zip(pool.imap(_gdkde_wrapper, pargs), malicious):
            # A worker reports a failure by returning the exception
            if isinstance(res, Exception):
                print(res)
                continue
            (_, fseq, _, _, attack_file) = res
            print('Processing file "{}":'.format(oldf))
            print(' scores: {}'.format(', '.join([str(s) for s in fseq])))
            print('Result: "{}"'.format(attack_file))
            if path.dirname(attack_file) != output_dir:
                shutil.move(attack_file, output_dir)
            if plot:
                pyplot.plot(fseq, label=oldf)
    finally:
        # Do not leave worker processes behind, even on error
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('GD-KDE attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_xlim(0, max_iter + 1)
        axes.set_ylabel('SVM score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario against a pickled
    classifier, using the benign files with the lowest decision-function
    scores as mimicry targets.

    The paths in the configuration section at the top of the function are
    placeholders and must be edited before the function can run.

    scenario_name - key of the scenario in the module's scenario table
    plot          - accepted for interface compatibility; not used by
                    this variant (no plot is produced).

    NOTE(review): moving the resulting attack files into the output
    directory is currently disabled (see the end of the loop), although
    the final message still names that directory.
    '''
    # Configuration (should be modified here)
    target_files = "replace with the path to the .csv of the pdfrateR benign files in the training data (without standardizartion)"
    # NOTE(review): as a placeholder this is a non-empty string and
    # therefore always true; replace it with a real boolean.
    is_binary = "set it to be true if pdfrateB is to be evaluated"
    scaler_path = "replaced with the path to the scaler"
    # The train/test sets are only consumed by the disabled retraining
    # code below; they are still loaded exactly as before.
    training_data_path = "replaced with the path to .csv of the standardized training data of pdfrateR"
    X_train, y_train, _ = datasets.csv2numpy(training_data_path)
    test_data_path = "replaced with the path to .csv of the standardized training data of pdfrateB"
    X_test, y_test, _ = datasets.csv2numpy(test_data_path)
    n_trial = 30
    model_path = "Replaced with the path to the model to be evaluated"

    print('Running the mimicry attack...')
    # Disabled: train and test the target classifier from scratch
    #print("First train and test the target classifier")
    #classifier = SVC(kernel='rbf', C=10, gamma=0.01)
    #classifier.fit(X_train, y_train)
    #y_pred = classifier.predict(X_test)
    #print(confusion_matrix(y_pred, y_test))
    print("First load the classifier")
    # NOTE(review): file modes are unchanged; both pickles come from
    # locally configured paths and are trusted as before.
    with open(model_path, 'r') as model_file:
        classifier = pickle.load(model_file)

    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)

    # Get the n_trial benign files with the lowest classifier scores
    X_ben, _, ben_paths = datasets.csv2numpy(target_files)
    with open(scaler_path) as scaler_file:
        loaded_scaler = pickle.load(scaler_file)
    # Keep an unscaled copy: the targets are handed over unscaled
    X_ben_copy = copy.deepcopy(X_ben)
    X_ben_scaled = loaded_scaler.transform(X_ben)
    y_score_ben = classifier.decision_function(X_ben_scaled)
    most_ben_ind = y_score_ben.argsort()[:n_trial]
    y_score_most_ben = y_score_ben[most_ben_ind]
    print(most_ben_ind)
    print("The ave score of the most 30 benign: {}".format(
        numpy.mean(y_score_most_ben)))
    print("The ave score of the benign: {}".format(numpy.mean(y_score_ben)))
    target_vectors = X_ben_copy[most_ben_ind]
    target_paths = list(numpy.array(ben_paths)[most_ben_ind])

    # Load benign files
    print("Loading the most benign files as targets")
    # Materialized so the same pairs can be shipped with every task
    targets = list(zip(target_paths, target_vectors))

    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from file "{}"'.format(wolves))
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Standardize data points if necessary; the scaler loaded above comes
    # from the same path, so it is reused instead of unpickled again
    scaler = None
    if 'scaled' in scenario['model']:
        scaler = loaded_scaler
        print('Using scaler')

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler, is_binary)
             for mal in malicious]

    print('Running the attack...')
    try:
        for wolf_path, res in zip(malicious,
                                  pool.imap(_mimicry_wrap, pargs)):
            # A worker reports a failure by returning the exception
            if isinstance(res, Exception):
                print(res)
                continue
            (target_path, mimic_path, mimic_score, wolf_score) = res
            # Disabled: per-file report and move into the output directory
            #print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score)
            #print ' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score)
            #if path.dirname(mimic_path) != output_dir:
            #    print ' Moving best to {}\n'.format(path.join(output_dir, path.basename(mimic_path)))
            #    shutil.move(mimic_path, output_dir)
    finally:
        # Do not leave worker processes behind, even on error
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario and saves the
    resulting attack files in the location specified by the configuration
    file.

    scenario_name - key of the scenario in the module's scenario table
    plot          - if it evaluates to False, no plot is produced; if it
                    is the string 'show', the plot is shown in a window;
                    any other true value is used as the name of the file
                    the plot is saved into.

    Raises ValueError if the scenario names a classifier other than 'rf'
    or 'svm'.
    '''
    print('Running the mimicry attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)

    # Load benign files
    print('Loading attack targets from file "{}"'.format(scenario['targets']))
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    # Materialized so the same pairs can be shipped with every task
    targets = list(zip(target_paths, target_vectors))

    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from file "{}"'.format(wolves))
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Set up classifier
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print('ATTACKING RANDOM FOREST')
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print('ATTACKING SVM')
    else:
        # Fail with a clear message instead of an AttributeError later on
        raise ValueError('Unknown classifier "{}" in scenario "{}"'.format(
            scenario['classifier'], scenario_name))
    print('Loading model from "{}"'.format(scenario['model']))
    classifier.load_model(scenario['model'])

    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        # NOTE(review): file mode is unchanged (default); the pickle comes
        # from a locally configured path and is trusted as before.
        with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
            scaler = pickle.load(scaler_file)
        print('Using scaler')

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for wolf_path, res in zip(malicious,
                                  pool.imap(_mimicry_wrap, pargs)):
            # A worker reports a failure by returning the exception
            if isinstance(res, Exception):
                print(res)
                continue
            (target_path, mimic_path, mimic_score, wolf_score) = res
            print('Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score))
            print(' BEST: {p} [{s}]'.format(p=target_path, s=mimic_score))
            if path.dirname(mimic_path) != output_dir:
                # The attack file keeps its own base name when moved
                print(' Moving best to {}\n'.format(
                    path.join(output_dir, path.basename(mimic_path))))
                shutil.move(mimic_path, output_dir)
            if plot:
                pyplot.plot([wolf_score, mimic_score])
    finally:
        # Do not leave worker processes behind, even on error
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('Mimicry attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
def attack_gdkde(scenario_name, plot=False):
    '''
    Invokes the GD-KDE attack for the given scenario and saves the
    resulting attack files in the location specified by the configuration
    file.

    scenario_name - key of the scenario in the module's scenario table
    plot          - if it evaluates to False, no plot is produced; if it
                    is the string 'show', the plot is shown in a window;
                    any other true value is used as the name of the file
                    the plot is saved into.
    '''
    print('Running the GD-KDE attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_gdkde'.format(scenario_name))
    # Make results reproducible
    random.seed(0)

    # Load and print malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from "{}"'.format(wolves))
    malicious = utility.get_pdfs(wolves)
    if not malicious:
        _attack_files_missing(wolves)

    # Load an SVM trained with scaled data
    # NOTE(review): file mode is unchanged (default); the pickle comes from
    # a locally configured path and is trusted as before.
    scaler_location = config.get('datasets', 'contagio_scaler')
    with open(scaler_location) as scaler_file:
        scaler = pickle.load(scaler_file)
    print('Using scaler')
    svm = sklearn_SVC()
    print('Loading model from "{}"'.format(scenario['model']))
    svm.load_model(scenario['model'])

    # Load the training data used for kernel density estimation
    print('Loading dataset from file "{}"'.format(scenario['training']))
    X_train, y_train, _ = datasets.csv2numpy(scenario['training'])
    # Subsample for faster execution; capped at the dataset size because
    # random.sample() raises ValueError when asked for more items than
    # the population holds
    population = range(len(y_train))
    ind_sample = random.sample(population, min(500, len(y_train)))
    X_train = X_train[ind_sample, :]
    y_train = y_train[ind_sample]

    # Set parameters
    kde_reg = 10
    kde_width = 50
    step = 1
    max_iter = 50

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    # The trailing False is passed through to the worker unchanged
    pargs = [(svm, fname, scaler, X_train, y_train, kde_reg, kde_width,
              step, max_iter, False) for fname in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for res, oldf in zip(pool.imap(_gdkde_wrapper, pargs), malicious):
            # A worker reports a failure by returning the exception
            if isinstance(res, Exception):
                print(res)
                continue
            (_, fseq, _, _, attack_file) = res
            print('Processing file "{}":'.format(oldf))
            print(' scores: {}'.format(', '.join([str(s) for s in fseq])))
            print('Result: "{}"'.format(attack_file))
            if path.dirname(attack_file) != output_dir:
                shutil.move(attack_file, output_dir)
            if plot:
                pyplot.plot(fseq, label=oldf)
    finally:
        # Do not leave worker processes behind, even on error
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('GD-KDE attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_xlim(0, max_iter + 1)
        axes.set_ylabel('SVM score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))