def get_all_keys(folder, env):
    # Walk every per-sample .npz dump and collect the set of unique keys
    # seen in each of the 7 featuresets.
    total_features = 7
    sets = {}
    for i in range(total_features):
        sets[i] = set()
    counter = 0
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            key = file_name.split('/')[-1]
            key = key.split('.')[0]
            file_name = '%s/%s' % (root, file_name)
            data = np.load(file_name)
            for fn in range(total_features):
                feature_set = data.items()[0][1][fn]
                sets[fn].update(list(feature_set[:, 0]))
            counter += 1
            if counter % 100 == 0:
                all_features = sets.items()
                all_features.sort(key=lambda x: x[0])
                print counter, [len(v) for k, v in all_features]
    all_features = sets.items()
    all_features.sort(key=lambda x: x[0])
    print 'Results:', [len(v) for k, v in all_features]
    sets = [to_array(v) for k, v in all_features]
    fname = config.locate_file(env, 'all_featuresets_keys')
    np.savez_compressed(fname, sets)
    print 'saved to', fname
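
# NOTE: to_array is used above but is not defined in this section. A minimal
# sketch, assuming it only materializes a key set into a NumPy array so it can
# be stored with np.savez_compressed (the real helper may differ):
def to_array(key_set):
    # Sort for a deterministic order; tuple keys (n-grams) sort fine as well.
    return np.array(sorted(key_set))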
def build_model(env, n):
    max_keys = load_max_keys(env, n)
    max_keys = max_keys.keys()
    max_keys.sort()
    features = []
    fname = config.locate_file(env, DATA_FILE % env)
    with open(fname) as f:
        for line in f:
            record = ujson.loads(line.strip())
            record = dict([(key_to_str(k), v) for k, v in record])
            current = [record['key']] + [record.get(k, 0) for k in max_keys]
            features.append(current)
    features = pd.DataFrame(features)
    fname = config.locate_file(env, OUTPUT_FILE % (n, env))
    features.to_csv(fname)
    print 'Saved to', fname
def calc_top_n_ngrams(env, n):
    # Count how often each n-gram key appears across all records and keep
    # the n most frequent ones.
    counter = Counter()
    fname = config.locate_file(env, DATA_FILE % env)
    with open(fname) as f:
        for line in f:
            record = ujson.loads(line.strip())
            for k, value in record:
                if k == 'key':
                    continue
                k = key_to_str(k)
                counter[k] = counter[k] + 1
    counter = counter.items()
    counter.sort(key=lambda v: v[1], reverse=True)
    counter = dict(counter[:n])
    fname = config.locate_file(env, MAX_NGRAMS_FILE % n)
    with open(fname, 'w') as f:
        f.write(ujson.dumps(counter))
    print 'Saved to', fname
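
# NOTE: key_to_str is referenced above (and in build_model) but not defined in
# this section. A plausible sketch, assuming an n-gram key arrives from ujson
# as a list (or is already a string) and must become a hashable Counter key
# and DataFrame column name:
def key_to_str(k):
    if isinstance(k, (list, tuple)):
        return '_'.join(str(part) for part in k)
    return str(k)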
def constuct_n_model_top_n(env, n, folder, all_sets, top_n_fn, postfix):
    print 'Constructing top n model', n, postfix
    all_keys = top_n_fn(n)
    print 'Max key number', len(all_keys)
    model = construct_n_model_using_keys(env, n, folder, all_sets, all_keys)
    model = pd.DataFrame(model)
    fname = config.locate_file(env, 'opcodes_%s_model_%s.pd' % (n, postfix))
    model.to_csv(fname)
    print 'saved to file', fname
def constuct_n_model(env, n, folder, all_sets, force_str, postfix, known_vals=None):
    print 'Constructing featureset', n
    all_keys = list(all_sets[n - 1])
    if isinstance(all_keys[0], np.ndarray):
        all_keys = [tuple(k) for k in all_keys]
    all_keys.sort()
    model = construct_n_model_using_keys(env, n, folder, all_sets, all_keys,
                                         force_str, known_vals)
    model = pd.DataFrame(model)
    fname = config.locate_file(env, 'opcodes_%s_model_%s.pd' % (n, postfix))
    model.to_csv(fname)
    print 'saved to', fname
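
# NOTE: construct_n_model_using_keys is called above but defined elsewhere in
# the repo. A hypothetical sketch, assuming it walks the same per-sample .npz
# dumps as count_n_fset_stats and emits one row of counts per sample, aligned
# with all_keys (the real implementation may differ, e.g. in how it uses
# force_str and known_vals):
def construct_n_model_using_keys(env, n, folder, all_sets, all_keys,
                                 force_str=False, known_vals=None):
    rows = []
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            sample_id = file_name.split('.')[0]
            data = np.load('%s/%s' % (root, file_name))
            fset = dict(data.items()[0][1][n - 1])
            rows.append([sample_id] + [fset.get(k, 0) for k in all_keys])
    return rows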
def top_by_freq(env, n, top_n):
    keys = []
    fname = config.locate_file(env, 'featureset_stats_%s.pkl' % n)
    with open(fname) as f:
        stats = pickle.load(f)
    freqs = compute_freq(stats)
    for i in range(NUM_CLASSES):
        if i in freqs:
            class_freqs = list(freqs[i])
            class_freqs.sort(key=lambda x: x[1], reverse=True)
            keys.extend(class_freqs[:top_n])
    keys = list(set(keys))
    keys.sort()
    return keys
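
# NOTE: compute_freq is used above but not shown in this section. A minimal
# sketch, assuming it simply exposes the per-class 'count' dictionaries
# produced by count_n_fset_stats as lists of (key, count) pairs that
# top_by_freq can sort (the real helper may normalize, e.g. by class size):
def compute_freq(stats):
    freqs = {}
    for cls, counts in stats['count'].items():
        freqs[cls] = list(counts.items())
    return freqs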
def count_n_fset_stats(env, n, folder):
    # For featureset n, accumulate per-class totals: the summed value of each
    # key ('sum') and the number of samples containing it ('count').
    print 'Counting featureset stats', n
    labels = pd.read_csv(config.conf[env]['labels'])
    labels.set_index('Id', inplace=True)
    sum_per_class = {}
    count_per_class = {}
    counter = 0
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            key = file_name.split('/')[-1]
            key = key.split('.')[0]
            cls = int(labels.ix[key].Class) - 1
            file_name = '%s/%s' % (root, file_name)
            data = np.load(file_name)
            fset = dict(data.items()[0][1][n - 1])
            sum_d = sum_per_class.get(cls, {})
            count_d = count_per_class.get(cls, {})
            for k, v in fset.iteritems():
                if k in sum_d:
                    sum_d[k] = sum_d[k] + v
                    count_d[k] = count_d[k] + 1
                else:
                    sum_d[k] = v
                    count_d[k] = 1
            sum_per_class[cls] = sum_d
            count_per_class[cls] = count_d
            counter += 1
            if counter % 100 == 0:
                print 'processed', counter
    total = {'sum': sum_per_class, 'count': count_per_class}
    fname = config.locate_file(env, 'featureset_stats_%s.pkl' % n)
    with open(fname, 'w') as f:
        pickle.dump(total, f)
def top_unique(env, n, top_n):
    # keys = []
    fname = config.locate_file(env, 'featureset_stats_%s.pkl' % n)
    with open(fname) as f:
        stats = pickle.load(f)
    all_keys = {}
    for i in range(NUM_CLASSES):
        for k in stats['count'][i].iterkeys():
            if k not in all_keys:
                all_keys[k] = 1
            else:
                all_keys[k] = all_keys[k] + 1
    unique = set([k for k, v in all_keys.iteritems() if v == 1])
    columns = []
    for i in range(NUM_CLASSES):
        counts = [(k, v) for k, v in stats['count'][i].iteritems() if k in unique]
        counts.sort(key=lambda x: x[1], reverse=True)
        columns.extend([k for k, v in counts[:top_n]])
    return columns
def bytes_1_grams(input_, output, limit):
    rows = []
    for row in crunch(input_, ".bytes", one_grams, limit=limit):
        rows.append(row)
    pd.DataFrame(rows).to_csv(output)
    print "Saved to", output


def bytes_2_grams(input_, output, limit):
    print "Processing", input_
    with open(output, "w") as f:
        for row in crunch(input_, ".bytes", two_grams, limit=limit):
            f.write(ujson.dumps(row))
            f.write("\n")
    print "Saved to", output


if __name__ == "__main__":
    env = sys.argv[1].lower()
    limit = config.conf[env].get("limit", None)
    one_gr_file = ONE_GR_FILE.format(env=env)
    bytes_1_grams(config.conf[env]["input"],
                  config.locate_file(env, one_gr_file), limit)
    two_gr_file = TWO_GR_FILE.format(env=env)
    bytes_2_grams(config.conf[env]["input"],
                  config.locate_file(env, two_gr_file), limit)
def load_max_keys(env, n):
    fname = config.locate_file(env, MAX_NGRAMS_FILE % n)
    with open(fname) as f:
        return ujson.loads(''.join(f.readlines()))
import numpy as np
import pe_parser
from sevenz_cruncher import crunch
import config
import sys


def process_asm(key, lines, folder):
    features = pe_parser.parse(lines)
    np.savez_compressed(folder % key, features)
    return 1


if __name__ == '__main__':
    env = sys.argv[1]
    folder = config.locate_file(env, config.conf[env]['asm_folder'])
    limit = config.conf[env].get('limit', None)
    crunch(config.conf[env]['input'], '.asm',
           lambda key, lines: process_asm(key, lines, folder + '/%s'),
           limit=limit)
#         keys = fn(n)
#     else:
#         keys = list(all_sets[n - 1])
#     mappings[name] = [(k, i) for i, k in enumerate(keys)]
# save_json('all_mappings.txt', mappings)


def save_json(fname, object_):
    with open(fname, 'w') as f:
        f.write(ujson.dumps(object_))
    print 'Saved to', fname


if __name__ == '__main__':
    env = sys.argv[1]
    folder = config.locate_file(env, config.conf[env]['asm_folder'])
    if config.conf[env]['calc_stats']:
        get_all_keys(folder, env)
    fname = config.locate_file(env, 'all_featuresets_keys.npz')
    all_sets = np.load(fname).items()[0][1]
    constuct_n_model(env, 1, folder, all_sets, False, env)
    if config.conf[env]['calc_stats']:
        count_n_fset_stats(env, 2, folder)
        count_n_fset_stats(env, 3, folder)
        count_n_fset_stats(env, 4, folder)
        count_n_fset_stats(env, 5, folder)