def clean_exit(error_code, message, kill_generator=False):
    """Log a critical message, shut everything down and exit the process.

    Meant for errors that cannot be recovered from.

    Args:
        error_code: Status handed to sys.exit().
        message: Text logged at critical level before shutting down.
        kill_generator: When true, generator.stop_generator(2) is called
            before the logger is stopped.
    """
    logger.log_critical(module_name, message)

    # The generator is only torn down on request.
    if kill_generator:
        generator.stop_generator(2)

    # Stop the logger last, then terminate.
    logger.log_stop()
    sys.exit(error_code)
def test_reader():
    """Command-line smoke test: disassemble a trace into a temporary file.

    Expects sys.argv to be: <input_file> <memory_file> <bin_dir>. Every item
    yielded by disasm_pt_file() (up to the first None) is written, one
    str() per line, to a temporary text file whose path is logged at the
    end. On any failure the traceback is printed, the temporary file is
    removed and the process exits with status 1.
    """
    from sys import argv, exit
    import traceback

    if len(argv) < 4:
        print(argv[0], '<input_file>', '<memory_file>', '<bin_dir>')
        exit(0)

    logger.log_start(logging.DEBUG)

    # Pre-declare so the error path can tell how far setup got.
    ofilefd = None
    opath = None
    try:
        ofd, opath = tempfile.mkstemp(text=True)
        ofilefd = fdopen(ofd, 'w')
        mem_map = read_memory_file(argv[2])
        for entry in disasm_pt_file(argv[1], argv[3], mem_map):
            if entry is None:
                break  # None is treated as the end-of-stream marker
            ofilefd.write(str(entry) + "\n")
        ofilefd.close()
    except BaseException:
        # BaseException (not Exception) on purpose: an interrupt should
        # also clean up the temporary file, as the bare except did.
        traceback.print_exc()
        if ofilefd is not None:
            ofilefd.close()
        if opath is not None:
            remove(opath)
        logger.log_stop()
        exit(1)

    logger.log_info(module_name, 'Wrote generated tuples to ' + str(opath))
    logger.log_stop()
def test_generator():
    """Command-line smoke test: run the generator over one trace.

    Expects sys.argv to be: <input_file> <bin_dir> <memory_file> <seq_len>.
    Starts the generator with 2 workers and the 'ret' filter, submits a
    single job, and writes every result as "<res[0]>: <res[1]>" to a
    temporary text file whose path is logged at the end. On any failure the
    traceback is printed, the temporary file is removed and the process
    exits with status 1.
    """
    from sys import argv, exit
    import reader
    import tempfile
    import traceback

    if len(argv) < 5:
        print(argv[0], '<input_file>', '<bin_dir>', '<memory_file>', '<seq_len>')
        exit(0)

    logger.log_start(logging.DEBUG)

    # Pre-declare so the error path can tell how far setup got.
    ofilefd = None
    opath = None
    try:
        ofd, opath = tempfile.mkstemp(text=True)
        ofilefd = os.fdopen(ofd, 'w')

        filters.set_filters(['ret'])
        memory = reader.read_memory_file(argv[3])
        job_queue, result_queue = start_generator(
            2, reader.disasm_pt_file, seq_len=int(argv[4], 10))
        job_queue.put((None, argv[1], argv[2], memory))

        while True:
            try:
                res = result_queue.get(True, 5)
            except queue.Empty:
                # Sample the in-service count once so that the value tested
                # and the value logged cannot disagree.
                count = get_in_service()
                if count == 0:
                    break
                logger.log_debug(
                    module_name,
                    str(count) + ' workers still working on jobs')
                continue
            ofilefd.write(str(res[0]) + ": " + str(res[1]) + "\n")

        stop_generator(10)
        ofilefd.close()
    except BaseException:
        # BaseException (not Exception) on purpose: an interrupt should
        # also clean up the temporary file, as the bare except did.
        traceback.print_exc()
        if ofilefd is not None:
            ofilefd.close()
        if opath is not None:
            os.remove(opath)
        logger.log_stop()
        exit(1)

    logger.log_info(module_name, 'Wrote generated tuples to ' + str(opath))
    logger.log_stop()
def clean_exit(error_code, message):
    """Log a critical message, stop the logger and exit the process.

    Meant for errors that cannot be recovered from.

    Args:
        error_code: Status handed to sys.exit().
        message: Text logged at critical level before shutting down.
    """
    # Report first, so the message is emitted before the logger stops.
    logger.log_critical(MODULE_NAME, message)
    logger.log_stop()

    sys.exit(error_code)
# Testing model
if not options.skip_test:
    logger.log_info(MODULE_NAME, 'Starting testing')
    try:
        test_model(sets_meta['b_test'])
    except KeyboardInterrupt:
        clean_exit(EXIT_USER_INTERRUPT, 'Keyboard interrupt, cleaning up...')
    except Exception:
        # Exception rather than a bare except: a SystemExit raised further
        # down must keep its own exit status instead of being reported as
        # an unexpected runtime error.
        clean_exit(EXIT_RUNTIME_ERROR,
                   "Unexpected error:\n" + traceback.format_exc())
else:
    logger.log_info(MODULE_NAME, 'Skipping testing')

# Evaluating model
if not options.skip_eval:
    logger.log_info(MODULE_NAME, 'Starting evaluation')
    try:
        eval_model(sets_meta['b_test'] + sets_meta['m_test'])
    except KeyboardInterrupt:
        clean_exit(EXIT_USER_INTERRUPT, 'Keyboard interrupt, cleaning up...')
    except Exception:
        # See the note on the testing stage above the same handler.
        clean_exit(EXIT_RUNTIME_ERROR,
                   "Unexpected error:\n" + traceback.format_exc())
else:
    logger.log_info(MODULE_NAME, 'Skipping evaluation')

# Cleanup
logger.log_info(MODULE_NAME, 'Cleaning up and exiting')
logger.log_stop()
def main():
    """Count destinations per source history in a trace and save a CSV.

    Reads <pt_trace_dir>/trace_parsed.gz through reader.read_preprocessed(),
    keeps a sliding history of the last max_seq source BBIDs and, for every
    entry accepted by at least one enabled filter, records the destination
    against each history suffix of length 1..max_seq via insert().

    The output CSV has one row per sequence length. Column k (1..100) holds
    the number of sources with exactly k distinct destinations; the last
    column also absorbs every source with more than 100 destinations.

    Exits with status 1 on invalid arguments.
    """
    global edges, max_seq

    # Parse input arguments
    parser = OptionParser(
        usage='Usage: %prog [options] pt_trace_dir output_file')
    parser.add_option('-r', '--parse-ret', action='store_true',
                      dest='parse_ret', help='Consider returns')
    parser.add_option('-c', '--parse-icall', action='store_true',
                      dest='parse_icall', help='Consider indirect calls')
    parser.add_option('-j', '--parse-ijmp', action='store_true',
                      dest='parse_ijmp', help='Consider indirect jumps')
    parser.add_option('-s', '--sequence-length', action='store',
                      dest='max_seq', type='int', default=32,
                      help='Max sequence length to calculate (default: 32)')

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        sys.stdout.write("\n Note: Only preprocessed traces are supported\n")
        sys.exit(1)

    trace_filepath = os.path.join(args[0], 'trace_parsed.gz')
    opath = args[1]
    max_seq = options.max_seq

    # Input validation
    if max_seq < 1:
        # A negative length would otherwise fail inside np.zeros() after the
        # logger has been started; zero would produce a header-only file.
        sys.stderr.write("Error: Sequence length must be at least 1\n")
        sys.exit(1)

    if not os.path.isfile(trace_filepath):
        sys.stderr.write('Error: ' + str(trace_filepath)
                         + " either does not exist or is not a file\n")
        sys.exit(1)

    if options.parse_ret:
        filters.add_filter('ret')
    if options.parse_icall:
        filters.add_filter('icall')
    if options.parse_ijmp:
        filters.add_filter('ijmp')

    if filters.get_num_enabled() == 0:
        sys.stderr.write(
            "Error: Must specify at least one thing to learn (-r, -c, -j)\n")
        sys.exit(1)

    # Initialization
    logger.log_start(20)

    history = list()  # History of past basic blocks

    # edges is a three-level dictionary where the keys for the first layer
    # are sequence length, the keys for the second layer are source BBID(s),
    # and the keys for the third layer are destination BBID. The value is
    # count (i.e., how many times that (src, dst) pair has occurred).
    edges = dict()
    for seq_len in range(1, max_seq + 1):
        edges[seq_len] = dict()

    # Parsing
    logger.log_info(module_name, 'Parsing trace')
    for entry in reader.read_preprocessed(trace_filepath):
        if entry is None:
            break  # End of trace

        src_bbid, dst_bbid, _ = entry[:3]

        # Update history
        history.append(src_bbid)
        if len(history) > max_seq:
            history.pop(0)

        # Only entries accepted by at least one enabled filter are counted.
        if not any(func(entry) for func in filters.enabled_filters):
            continue

        for seq_len in range(1, min(len(history), max_seq) + 1):
            insert(history[-seq_len:], dst_bbid)

    # Distribution of how many possible destinations sources have, up to
    # df_max destinations.
    logger.log_info(module_name, 'Calculating distributions')
    df_max = 100
    df = np.zeros((max_seq, df_max), dtype=int)

    for seq_len in range(1, max_seq + 1):
        for dsts in edges[seq_len].values():
            # Anything above df_max destinations lands in the last column.
            column = min(len(dsts), df_max) - 1
            df[seq_len - 1][column] += 1

    # Save statistics
    logger.log_info(module_name, 'Saving statistics to ' + str(opath))
    with open(opath, 'w') as ofile:
        # Header
        ofile.write('seq_len,'
                    + ','.join([str(x) for x in range(1, df_max + 1)])
                    + "\n")
        # Data
        for seq_len in range(1, max_seq + 1):
            ofile.write(str(seq_len) + ','
                        + ','.join([str(x) for x in df[seq_len - 1]])
                        + "\n")

    # Cleanup
    logger.log_stop()
def main():
    """Train or load a linear SVM over an evaluation directory and report.

    Every regular file in eval_dir is handed to parse_file() in a worker
    pool. Samples whose first element (the true label) is 2 or more are
    dropped; of the rest, label 0 is treated as benign and label 1 as
    malicious, and elements 1 and 2 are the two features (named
    avg_accuracy and avg_confidence in the CSV header).

    Unless --load is given, a linear SVC is fitted on the samples; with
    --force the benign class is oversampled with ADASYN until the
    false-positive rate on the samples reaches 0. FP/FN rates are logged and
    the optional CSV, classifier, plot and ROC outputs are written.

    Exits with ERROR_INVALID_ARG on bad arguments and ERROR_RUNTIME when a
    saved classifier cannot be loaded.
    """
    global threshold

    parser = OptionParser(usage='Usage: %prog [options] eval_dir',
                          version='Barnum Classifier ' + module_version)
    parser.add_option('-f', '--force', action='store_true',
                      help='Force threshold to produce no false positives (benign classified as malicious)')
    parser.add_option('-s', '--save', action='store', type='str', default=None,
                      help='Save classifier to given filepath (default: no saving)')
    parser.add_option('-l', '--load', action='store', type='str', default=None,
                      help='Use a previously saved classifier instead of making a new one')
    parser.add_option('-c', '--csv', action='store', type='str', default=None,
                      help='Save CSV of results to given filepath (default: no CSV)')
    parser.add_option('-p', '--plot', action='store', type='str', default=None,
                      help='Save plot as a PNG image to the given filepath (default: no plotting)')
    parser.add_option('-r', '--roc', action='store', type='str', default=None,
                      help='Save CSV plotting ROC curve to filepath (default: not saved)')
    parser.add_option('-w', '--workers', action='store', dest='workers', type='int',
                      default=cpu_count(),
                      help='Number of workers to use (default: number of cores)')
    parser.add_option('-i', '--ignore-cache', action='store_true',
                      help='Do not use caching')

    options, args = parser.parse_args()

    if len(args) != 1 or options.workers < 1:
        parser.print_help()
        sys.exit(ERROR_INVALID_ARG)

    logger.log_start(20)
    logger.log_info(module_name, 'Barnum Classifier ' + module_version)

    idirpath = args[0]

    if not os.path.isdir(idirpath):
        logger.log_error(module_name, 'ERROR: ' + idirpath + " is not a directory")
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    files = [os.path.join(idirpath, f) for f in os.listdir(idirpath)
             if os.path.isfile(os.path.join(idirpath, f))]
    # Class membership is inferred from the file name at this stage.
    num_benign = len([fp for fp in files if 'benign' in os.path.basename(fp)])
    num_malicious = len([fp for fp in files if 'malicious' in os.path.basename(fp)])

    if options.load is None and (num_benign == 0 or num_malicious == 0):
        logger.log_error(module_name,
                         "Need at least 1 malicious and 1 benign sample to train a classifier")
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    if options.roc is not None and (num_benign == 0 or num_malicious == 0):
        logger.log_error(module_name,
                         "Need at least 1 malicious and 1 benign sample to plot a ROC curve")
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    if not options.ignore_cache:
        init_cache()

    # Calculate average accuracy and confidence for each sample
    logger.log_info(module_name, "Parsing " + idirpath)
    pool = Pool(options.workers)
    data = [sample for sample
            in pool.map(parse_file, zip(files, [options] * len(files)))
            if sample[0] < 2]
    pool.close()
    ys = np.array([sample[0] for sample in data])
    xs = np.array([sample[1:3] for sample in data])

    if options.load is None:
        logger.log_info(module_name, "Creating classifier")
        # Train a new classifier from scratch
        if options.force:
            # Use ADASYN to over sample the benign class until FP falls to 0
            warnings.filterwarnings("ignore", module="imblearn")
            fp = 1.0
            ben_cnt = len([y for y in ys if y == 0])
            mal_cnt = len(ys) - ben_cnt
            ben_step = max(1, int(ben_cnt * 0.1))
            while fp > 0.0:
                ben_cnt += ben_step
                try:
                    xs_os, ys_os = ADASYN({0: ben_cnt, 1: mal_cnt},
                                          n_jobs=options.workers).fit_resample(xs, ys)
                except ValueError:
                    continue  # Happens if change in counts produces too little change in ratio
                svm = SVC(kernel='linear')
                svm.fit(xs_os, ys_os)
                results = [[sample, svm.predict([sample[1:3]])] for sample in data]
                benign = [sample for sample in results if sample[0][0] == 0]
                fps = [sample for sample in results
                       if sample[0][0] == 0 and sample[1] == 1]
                fp = float(len(fps)) / float(len(benign))
        else:
            svm = SVC(kernel='linear')
            svm.fit(xs, ys)
    else:
        # Use a previously saved classifier
        logger.log_info(module_name, "Loading classifier from " + options.load)
        try:
            svm = joblib.load(options.load)
        except Exception as ex:
            logger.log_error(module_name, "Failed to load classifier: " + str(ex))
            logger.log_stop()
            sys.exit(ERROR_RUNTIME)

    # Metrics
    # Each result is [sample, prediction]; the prediction is whatever
    # svm.predict() returned for that single sample.
    results = [[sample, svm.predict([sample[1:3]])] for sample in data]
    benign = [sample for sample in results if sample[0][0] == 0]
    malicious = [sample for sample in results if sample[0][0] == 1]
    fps = [sample for sample in results if sample[0][0] == 0 and sample[1] == 1]
    fns = [sample for sample in results if sample[0][0] == 1 and sample[1] == 0]

    if len(benign) > 0:
        fp = float(len(fps)) / float(len(benign))
    else:
        fp = 'N/A'

    if len(malicious) > 0:
        fn = float(len(fns)) / float(len(malicious))
    else:
        fn = 'N/A'

    logger.log_info(module_name, "----------")
    logger.log_info(module_name, "FP: " + str(fp))
    logger.log_info(module_name, "FN: " + str(fn))
    logger.log_info(module_name, "----------")

    # Saving CSV
    if options.csv is not None:
        logger.log_info(module_name, "Saving CSV to " + options.csv)
        try:
            with open(options.csv, 'w') as csv_file:
                csv_file.write("true_label,pred_label,avg_accuracy,avg_confidence,name\n")
                for result in results:
                    csv_file.write(','.join([str(result[0][0]),
                                             str(result[1][0]),
                                             str(result[0][1]),
                                             str(result[0][2]),
                                             result[0][3]]) + "\n")
        except Exception as ex:
            # Report through the logger; the previous code referenced an
            # undefined name here and raised NameError instead of logging.
            logger.log_error(module_name, "Failed to save CSV: " + str(ex))

    # Saving Classifier
    if options.save is not None:
        logger.log_info(module_name, "Saving classifier to " + options.save)
        try:
            joblib.dump(svm, options.save)
        except Exception:
            logger.log_error(module_name, "Failed to save classifier to " + options.save)

    # Plotting
    if options.plot is not None:
        logger.log_info(module_name, "Saving plot to " + options.plot)
        axes = plt.gca()
        axes.set_xlim([0, 1])
        axes.set_ylim([0, 1])
        # Decision boundary w[0]*x + w[1]*y + intercept = 0, solved for y.
        w = svm.coef_[0]
        a = -w[0] / w[1]
        xx = np.linspace(0, 1)
        yy = a * xx - (svm.intercept_[0]) / w[1]
        plt.scatter([sample[0][1] for sample in benign],
                    [sample[0][2] for sample in benign],
                    marker='o', c='blue', s=20)
        plt.scatter([sample[0][1] for sample in malicious],
                    [sample[0][2] for sample in malicious],
                    marker='x', c='red', s=20)
        plt.plot(xx, yy, 'k--')
        plt.xlabel('Wrong Prediction (%)')
        plt.ylabel('Average Confidence (%)')
        try:
            plt.savefig(options.plot)
        except Exception:
            logger.log_error(module_name, "Failed to save plot")

    # ROC
    if options.roc is not None:
        logger.log_info(module_name, "Saving ROC to " + options.roc)
        make_roc(options.roc, data, svm)

    logger.log_stop()
def main():
    """Preprocess a raw trace directory into trace_parsed.gz.

    Takes a trace directory and a binary directory on the command line,
    locates the memory mapping and trace files inside the trace directory,
    disassembles the trace with reader.disasm_pt_file() and writes each
    packed instruction to <trace_directory>/trace_parsed.gz.part. The part
    file is renamed to trace_parsed.gz when at least one entry was written,
    and deleted when nothing was produced or when the timeout fired and
    --no-partial was given.
    """

    def bail(reason):
        # Log the error, shut the logger down and exit with status 1.
        logger.log_error(module_name, reason)
        logger.log_stop()
        sys.exit(1)

    # Parse input arguments
    parser = OptionParser(
        usage='Usage: %prog [options] trace_directory bin_directory')
    parser.add_option(
        '-f', '--force', action='store_true',
        help='If a complete or partial output already exists, overwrite it.')
    parser.add_option(
        '-t', '--timeout', action='store', type='int', default=None,
        help='Max seconds to run before quitting (default: infinite).')
    parser.add_option(
        '-p', '--no-partial', action='store_true',
        help='If timeout is reached, do not save the partially parsed trace.')

    options, args = parser.parse_args()
    if len(args) < 2:
        parser.print_help()
        sys.exit(0)

    data_dir, bin_dir = args[0], args[1]

    logger.log_start(logging.INFO)

    # Input validation
    if not os.path.isdir(data_dir):
        bail(data_dir + ' is not a directory')
    if not os.path.isdir(bin_dir):
        bail(bin_dir + ' is not a directory')

    if options.no_partial and options.timeout is None:
        logger.log_warning(
            module_name,
            "Setting --no-partial without --timeout does nothing")

    # Make sure all the expected files are there
    mem_file = None
    trace_file = None
    for name in os.listdir(data_dir):
        if name in ('mapping.txt', 'mapping.txt.gz'):
            mem_file = os.path.join(data_dir, name)
        elif name in ('trace_0', 'trace_0.gz'):
            trace_file = os.path.join(data_dir, name)

    if mem_file is None:
        bail('Could not find mapping.txt or mapping.txt.gz in ' + data_dir)
    if trace_file is None:
        bail('Could not find trace_0 or trace_0.gz in ' + data_dir)

    # Parse the memory file
    mem_map = reader.read_memory_file(mem_file)
    if mem_map is None:
        bail('Failed to parse memory mapping file')

    # We're ready to parse the trace
    o_filepath = os.path.join(data_dir, 'trace_parsed.gz')
    part_filepath = o_filepath + '.part'

    # Existing output is only overwritten when --force was given.
    if not options.force and os.path.isfile(o_filepath):
        bail('Preprocess file already exists')
    if not options.force and os.path.isfile(part_filepath):
        bail('Partial preprocess file already exists')

    entries = 0
    with gzip.open(part_filepath, 'wb') as ofile:
        for instr in reader.disasm_pt_file(trace_file, bin_dir, mem_map,
                                           options.timeout):
            if instr is None:
                break
            ofile.write(pack_instr(instr))
            entries += 1

    timed_out = reader.DISASM_TIMEOUT.is_set()
    if timed_out and options.no_partial:
        logger.log_info(module_name, "Deleting partial trace")
        os.remove(part_filepath)
    elif entries > 0:
        os.rename(part_filepath, o_filepath)
    else:
        logger.log_error(module_name, 'No output produced, empty file')
        os.remove(part_filepath)

    logger.log_stop()
def main():
    """Cluster the malicious traces of an evaluation directory with DBSCAN.

    Every regular file in eval_dir whose name contains 'malicious' is handed
    to parse_file() in a worker pool; falsy results are dropped. Element 0
    of each remaining sample is used as the feature vector and element 1 as
    the name written to the CSV. The samples are clustered with DBSCAN, the
    point / cluster / noise counts are logged, and optionally a CSV of
    (cluster, filename) rows and a radar plot of per-cluster means are
    saved.

    Exits with ERROR_INVALID_ARG on bad arguments or when no sample could be
    parsed.
    """
    parser = OptionParser(usage='Usage: %prog [options] eval_dir',
                          version='Barnum Cluster ' + module_version)
    parser.add_option(
        '-c', '--csv', action='store', type='str', default=None,
        help='Save CSV of results to given filepath (default: no CSV)')
    parser.add_option(
        '-p', '--plot', action='store', type='str', default=None,
        help='Save plot as a PNG image to the given filepath (default: no plotting)')
    parser.add_option(
        '-w', '--workers', action='store', dest='workers', type='int',
        default=cpu_count(),
        help='Number of workers to use (default: number of cores)')
    parser.add_option(
        '--max-classes', action='store', type='int', default=256,
        help='How many classes to use (default: 256)')
    parser.add_option(
        '--min-samples', action='store', type='int', default=4,
        help='Minimum samples to form a cluster in DBSCAN (default: 4)')
    parser.add_option(
        '--eps', action='store', type='float', default=0.03,
        help='Epsilon parameter to DBSCAN (default: 0.03)')

    options, args = parser.parse_args()

    if len(args) != 1 or options.workers < 1:
        parser.print_help()
        sys.exit(ERROR_INVALID_ARG)

    logger.log_start(20)
    logger.log_info(module_name, 'Barnum Cluster %s' % module_version)

    idirpath = args[0]

    if not os.path.isdir(idirpath):
        logger.log_error(module_name,
                         'ERROR: %s is not a directory' % idirpath)
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    files = [
        os.path.join(idirpath, f) for f in os.listdir(idirpath)
        if os.path.isfile(os.path.join(idirpath, f))
    ]
    # We only care about clustering malicious traces
    mal_files = [fp for fp in files if 'malicious' in os.path.basename(fp)]
    num_mal = len(mal_files)

    # Calculate clustering metrics
    logger.log_info(module_name, "Parsing " + idirpath)
    pool = Pool(options.workers)
    data = [
        sample for sample in pool.map(
            parse_file, zip(mal_files, [options.max_classes] * num_mal))
        if sample
    ]
    pool.close()

    if not data:
        # DBSCAN cannot fit an empty array; stop cleanly instead of dying
        # with a traceback while the logger is still running.
        logger.log_error(
            module_name,
            'ERROR: no malicious samples could be parsed from %s' % idirpath)
        logger.log_stop()
        sys.exit(ERROR_INVALID_ARG)

    xs = np.array([sample[0] for sample in data])
    ns = [sample[1] for sample in data]

    # Clustering
    logger.log_info(module_name, "Calculating clusters")
    db = DBSCAN(eps=options.eps, min_samples=options.min_samples).fit(xs)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)

    logger.log_info(module_name, ' Number of points: %d' % len(ns))
    logger.log_info(module_name, ' Number of clusters: %d' % n_clusters)
    logger.log_info(module_name, 'Number of noise points: %d' % n_noise)

    # Saving results as CSV
    if options.csv is not None:
        logger.log_info(module_name, "Saving CSV to %s" % options.csv)
        try:
            with open(options.csv, 'w') as csv_file:
                csv_file.write("cluster,filename\n")
                for label, name in zip(labels, ns):
                    csv_file.write(','.join([str(label), name]) + "\n")
        except Exception as ex:
            logger.log_error(module_name,
                             "Failed to save CSV: %s" % str(ex))

    # Saving results as plot image
    if options.plot is not None:
        logger.log_info(module_name, "Generating plot")
        theta = radar_factory(options.max_classes, frame='polygon')
        fig, axes = plt.subplots(subplot_kw=dict(projection='radar'))
        colors = ['b', 'r', 'g', 'm', 'y']
        axes.set_varlabels([""])  # no varlabels, they aren't that meaningful
        axes.set_rgrids([0.2, 0.4, 0.6, 0.8])
        legend_labels = list()
        for label_key in set(labels):
            if label_key == -1:
                continue  # noise
            legend_labels.append(label_key)
            label_color = colors[label_key % len(colors)]
            # Calculate per-cluster average over the cluster's core samples
            label_mask = (labels == label_key)
            label_points = xs[label_mask & core_samples_mask]
            label_means = np.mean(label_points, axis=0)
            axes.plot(theta, label_means, color=label_color)
            axes.fill(theta, label_means, facecolor=label_color, alpha=0.25)
        # Legend
        axes.legend(legend_labels, loc=(0.9, .95), labelspacing=0.1,
                    fontsize='small')

        try:
            plt.savefig(options.plot)
        except Exception:
            logger.log_error(module_name, "Failed to save plot")

    logger.log_stop()