Example #1
def get_all_keys(folder, env):
    total_features = 7
    sets = {}
    for i in range(total_features):
        sets[i] = set()

    counter = 0
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            file_name = '%s/%s' % (root, file_name)

            data = np.load(file_name)

            for fn in range(total_features):
                # each .npz stores a single object with all featuresets;
                # featureset fn is an array of (key, count) rows
                feature_set = data.items()[0][1][fn]
                sets[fn].update(list(feature_set[:, 0]))

            counter += 1

            if counter % 100 == 0:
                all_features = sets.items()
                all_features.sort(key=lambda x: x[0])
                print counter, [len(v) for k, v in all_features]

    all_features = sets.items()
    all_features.sort(key=lambda x: x[0])
    print 'Results:', [len(v) for k, v in all_features]

    sets = [to_array(v) for k, v in all_features]

    fname = config.locate_file(env, 'all_featuresets_keys')
    np.savez_compressed(fname, sets)
    print 'saved to', fname
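
to_array is not among the examples shown; a minimal sketch, assuming it only has to turn a key set into something np.savez_compressed can store:

def to_array(key_set):
    # hypothetical helper; sort for a stable column order and hand
    # NumPy a plain list
    keys = list(key_set)
    keys.sort()
    return np.array(keys)
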
def build_model(env, n):
    max_keys = load_max_keys(env, n)
    max_keys = max_keys.keys()
    max_keys.sort()

    features = []
    fname = config.locate_file(env, DATA_FILE % env)
    with open(fname) as f:
        for line in f:
            record = ujson.loads(line.strip())
            # each record is a list of [key, value] pairs, not a dict
            record = dict([(key_to_str(k), v) for k, v in record])

            current = [record['key']] + [record.get(k, 0) for k in max_keys]
            features.append(current)

    features = pd.DataFrame(features)
    fname = config.locate_file(env, OUTPUT_FILE % (n, env))
    features.to_csv(fname)
    print 'Saved to', fname
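
key_to_str is another helper that is not shown. Since ujson serialises n-gram keys as lists, a plausible sketch flattens such a key into a single string:

def key_to_str(key):
    # hypothetical: join the parts of an n-gram key into one string,
    # leave plain string keys untouched
    if isinstance(key, (list, tuple)):
        return '_'.join(str(part) for part in key)
    return str(key)
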
def calc_top_n_ngrams(env, n):
    counter = Counter()

    fname = config.locate_file(env, DATA_FILE % env)
    with open(fname) as f:
        for line in f:
            record = ujson.loads(line.strip())
            for k, value in record:
                if k == 'key':
                    continue

                k = key_to_str(k)
                counter[k] += 1
    counter = counter.items()
    counter.sort(key=lambda v: v[1], reverse=True)
    counter = dict(counter[:n])

    fname = config.locate_file(env, MAX_NGRAMS_FILE % n)
    with open(fname, 'w') as f:
        f.write(ujson.dumps(counter))
        print 'Saved to', fname
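
The two functions chain through MAX_NGRAMS_FILE: calc_top_n_ngrams ranks the keys and build_model (via load_max_keys, shown further down) turns them into columns. A hypothetical driver, with cut-offs that are illustrative rather than from the original:

if __name__ == '__main__':
    env = sys.argv[1]
    for n in (100, 1000):  # illustrative values
        calc_top_n_ngrams(env, n)
        build_model(env, n)
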
Example #6
def construct_n_model_top_n(env, n, folder, all_sets, top_n_fn, postfix):
    print 'Constructing top n model', n, postfix

    all_keys = top_n_fn(n)

    print 'Max key number', len(all_keys)

    model = construct_n_model_using_keys(env, n, folder, all_sets, all_keys)

    model = pd.DataFrame(model)
    fname = config.locate_file(env, 'opcodes_%s_model_%s.pd' % (n, postfix))
    model.to_csv(fname)
    print 'saved to file', fname
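
top_n_fn is expected to map n to a list of keys; top_by_freq and top_unique from the later examples both fit once their remaining arguments are bound. A hypothetical wiring, with an illustrative top_n of 500:

from functools import partial

construct_n_model_top_n(env, 2, folder, all_sets,
                        partial(top_by_freq, env, top_n=500), 'freq')
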
Example #7
def construct_n_model(env, n, folder, all_sets, force_str, postfix,
                      known_vals=None):
    print 'Constructing featureset', n

    all_keys = list(all_sets[n - 1])
    if isinstance(all_keys[0], np.ndarray):
        all_keys = [tuple(k) for k in all_keys]
    all_keys.sort()

    model = construct_n_model_using_keys(env, n, folder, all_sets, all_keys,
                                         force_str, known_vals)

    model = pd.DataFrame(model)
    fname = config.locate_file(env, 'opcodes_%s_model_%s.pd' % (n, postfix))
    model.to_csv(fname)
    print 'saved to', fname
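
construct_n_model_using_keys itself is not among the examples. A minimal sketch, assuming the same .npz layout as get_all_keys and count_n_fset_stats; env, all_sets and known_vals are accepted only to match both call sites and are ignored here:

def construct_n_model_using_keys(env, n, folder, all_sets, all_keys,
                                 force_str=False, known_vals=None):
    # map each key to its column position
    index = dict((k, i) for i, k in enumerate(all_keys))
    rows = []
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            sample = file_name.split('.')[0]
            data = np.load('%s/%s' % (root, file_name))
            # featureset n - 1 holds (key, count) pairs, as above
            fset = dict(data.items()[0][1][n - 1])
            row = [0] * len(all_keys)
            for k, v in fset.iteritems():
                if force_str:
                    k = str(k)
                if k in index:
                    row[index[k]] = v
            rows.append([sample] + row)
    return rows
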
Example #8
def top_by_freq(env, n, top_n):
    keys = []
    fname = config.locate_file(env, 'featureset_stats_%s.pkl' % n)
    with open(fname, 'rb') as f:
        stats = pickle.load(f)

    freqs = compute_freq(stats)

    for i in range(NUM_CLASSES):
        if i in freqs:
            class_freqs = list(freqs[i])
            class_freqs.sort(key=lambda x: x[1], reverse=True)
            # keep only the keys; the frequency was only needed for ranking
            keys.extend([k for k, v in class_freqs[:top_n]])

    keys = list(set(keys))
    keys.sort()
    return keys
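
compute_freq is not shown either; a sketch under the assumption that it normalises the per-class count dicts produced by count_n_fset_stats (next example) into (key, frequency) pairs:

def compute_freq(stats):
    # hypothetical: stats is the {'sum': ..., 'count': ...} pickle from
    # count_n_fset_stats; frequency = count / total count for that class
    freqs = {}
    for cls, counts in stats['count'].iteritems():
        total = float(sum(counts.itervalues()))
        freqs[cls] = [(k, v / total) for k, v in counts.iteritems()]
    return freqs
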
Example #9
def count_n_fset_stats(env, n, folder):
    print 'Counting featureset stats', n

    labels = pd.read_csv(config.conf[env]['labels'])
    labels.set_index('Id', inplace=True)

    sum_per_class = {}
    count_per_class = {}

    counter = 0
    for root, dirs, files in os.walk(folder):
        for file_name in files:
            key = file_name.split('/')[-1]
            key = key.split('.')[0]

            cls = int(labels.loc[key].Class) - 1

            file_name = '%s/%s' % (root, file_name)

            data = np.load(file_name)

            fset = dict(data.items()[0][1][n - 1])

            sum_d = sum_per_class.get(cls, {})
            count_d = count_per_class.get(cls, {})
            for k, v in fset.iteritems():
                if k in sum_d:
                    sum_d[k] = sum_d[k] + v
                    count_d[k] = count_d[k] + 1
                else:
                    sum_d[k] = v
                    count_d[k] = 1
            sum_per_class[cls] = sum_d
            count_per_class[cls] = count_d

            counter += 1

            if counter % 100 == 0:
                print 'processed', counter

    total = {'sum': sum_per_class, 'count': count_per_class}

    fname = config.locate_file(env, 'featureset_stats_%s.pkl' % n)
    with open(fname, 'wb') as f:
        pickle.dump(total, f)
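
A quick, illustrative way to check the result is to reload the pickle and look at per-class key coverage, which is what top_by_freq and top_unique consume:

with open(config.locate_file(env, 'featureset_stats_2.pkl'), 'rb') as f:
    stats = pickle.load(f)
print [len(stats['count'].get(i, {})) for i in range(NUM_CLASSES)]
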
Example #10
def top_unique(env, n, top_n):
    fname = config.locate_file(env, 'featureset_stats_%s.pkl' % n)
    with open(fname, 'rb') as f:
        stats = pickle.load(f)

    all_keys = {}
    for i in range(NUM_CLASSES):
        for k in stats['count'][i].iterkeys():
            if k not in all_keys:
                all_keys[k] = 1
            else:
                all_keys[k] = all_keys[k] + 1

    unique = set([k for k, v in all_keys.iteritems() if v == 1])

    columns = []
    for i in range(NUM_CLASSES):
        counts = [(k, v) for k, v in stats['count'][i].iteritems() if k in unique]
        counts.sort(key=lambda x: x[1], reverse=True)

        columns.extend([k for k, v in counts[:top_n]])

    return columns


def bytes_1_grams(input_, output, limit):
    print "Processing", input_

    rows = []
    for row in crunch(input_, ".bytes", one_grams, limit=limit):
        rows.append(row)

    pd.DataFrame(rows).to_csv(output)
    print "Saved to", output


def bytes_2_grams(input_, output, limit):
    print "Processing", input_

    with open(output, "w") as f:
        for row in crunch(input_, ".bytes", two_grams, limit=limit):
            f.write(ujson.dumps(row))
            f.write("\n")

    print "Saved to", output


if __name__ == "__main__":
    env = sys.argv[1].lower()

    limit = config.conf[env].get("limit", None)

    one_gr_file = ONE_GR_FILE.format(env=env)
    bytes_1_grams(config.conf[env]["input"], config.locate_file(env, one_gr_file), limit)

    two_gr_file = TWO_GR_FILE.format(env=env)
    bytes_2_grams(config.conf[env]["input"], config.locate_file(env, two_gr_file), limit)
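
one_grams and two_grams come from elsewhere; build_model iterates each stored record as (key, value) pairs, so the callbacks presumably return lists of pairs. A sketch of two_grams under that assumption (the '??' placeholder handling is also an assumption about the .bytes format); one_grams would be analogous, feeding pd.DataFrame directly:

from collections import Counter

def two_grams(key, lines):
    # hypothetical: count adjacent byte-token pairs per sample
    counts = Counter()
    prev = None
    for line in lines:
        for token in line.split()[1:]:  # drop the leading address column
            if token == '??':           # assumed unreadable-byte marker
                prev = None
                continue
            if prev is not None:
                counts[(prev, token)] += 1
            prev = token
    row = [['key', key]]
    row.extend([list(k), v] for k, v in counts.iteritems())
    return row
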
def load_max_keys(env, n):
    fname = config.locate_file(env, MAX_NGRAMS_FILE % n)
    with open(fname) as f:
        return ujson.loads(f.read())
import numpy as np
import pe_parser
from sevenz_cruncher import crunch
import config
import sys

def process_asm(key, lines, folder):
    # parse one disassembly listing and store its featuresets compressed;
    # the return value only serves as a processed-file marker for crunch
    features = pe_parser.parse(lines)
    np.savez_compressed(folder % key, features)
    return 1

if __name__ == '__main__':
    env = sys.argv[1]

    folder = config.locate_file(env, config.conf[env]['asm_folder'])
    limit = config.conf[env].get('limit', None)
    crunch(config.conf[env]['input'], '.asm',
           lambda key, lines: process_asm(key, lines, folder + '/%s'),
           limit=limit)
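
sevenz_cruncher.crunch is not shown; judging by the name it reads samples out of 7z archives, but a directory-walking sketch captures the contract: find files with the given extension, hand (key, lines) to the callback, collect the results, stop at limit:

import os

def crunch(input_path, extension, callback, limit=None):
    # hypothetical stand-in for sevenz_cruncher.crunch
    results = []
    for root, dirs, files in os.walk(input_path):
        for file_name in files:
            if not file_name.endswith(extension):
                continue
            key = file_name[:-len(extension)]
            with open('%s/%s' % (root, file_name)) as f:
                results.append(callback(key, f.readlines()))
            if limit is not None and len(results) >= limit:
                return results
    return results
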
Example #15
#             keys = fn(n)
#         else:
#             keys = list(all_sets[n - 1])

#         mappings[name] = [(k, i) for i, k in enumerate(keys)]

#     save_json('all_mappings.txt', mappings)

def save_json(fname, object_):
    with open(fname, 'w') as f:
        f.write(ujson.dumps(object_))
    print 'Saved to', fname

if __name__ == '__main__':
    env = sys.argv[1]
    folder = config.locate_file(env, config.conf[env]['asm_folder'])

    if config.conf[env]['calc_stats']:
        get_all_keys(folder, env)

    fname = config.locate_file(env, 'all_featuresets_keys.npz')
    # the npz holds a single entry: the list of per-n key arrays
    all_sets = np.load(fname).items()[0][1]

    construct_n_model(env, 1, folder, all_sets, False, env)

    if config.conf[env]['calc_stats']:
        count_n_fset_stats(env, 2, folder)
        count_n_fset_stats(env, 3, folder)
        count_n_fset_stats(env, 4, folder)
        count_n_fset_stats(env, 5, folder)