def filtering(dic, alpha=3, beta=5):
    """Return a copy of ``dic`` without rare or short features.

    An entry is kept only when its value is not below ``alpha`` and the
    length of its key is not below ``beta``.  The input mapping is left
    untouched; surviving entries keep their original order.

    Args:
        dic: Mapping of feature key -> occurrence count.
        alpha: Minimum occurrence count for a feature to be kept (default 3).
        beta: Minimum key length for a feature to be kept (default 5).

    Returns:
        A new ``dict`` holding only the entries that pass both thresholds.
    """
    print('Normalizing Values - Start')
    max_bar = len(dic)
    dic_n = {}
    # Count from 1 so the last update reports max_bar (a completed bar),
    # the same way the other dataset loaders in this file report progress.
    for incremento, (key, value) in enumerate(dic.items(), start=1):
        # Same predicate (and short-circuit order) as the removal test:
        # drop when the count is too low or the key is too short.
        if not (value < alpha or len(key) < beta):
            dic_n[key] = value
        progress_bar.update(max_bar, incremento)
    print('Normalizing Values - End')
    return dic_n
def database_train(train_data):
    """Build the per-family training database.

    Every filename in ``train_data`` is assigned a family: the value stored
    for it in the module-level ``malware_map`` when present, otherwise the
    literal ``'Safe'``.  The file is then passed to ``analyze_file`` together
    with whatever has been accumulated for that family so far (an empty dict
    the first time), and the returned value becomes the family's new entry.

    Args:
        train_data: Sized iterable of filenames making up the training set.

    Returns:
        Dict mapping family name -> value returned by the last
        ``analyze_file`` call for that family.
    """
    print('Creating Dataset from Training Data - Start')
    total = len(train_data)
    families = {}
    for done, filename in enumerate(train_data, start=1):
        # Files unknown to malware_map are treated as benign.
        family = malware_map.get(filename) if filename in malware_map else 'Safe'
        previous = families.get(family) if family in families else {}
        families[family] = analyze_file(filename, previous)
        progress_bar.update(total, done)
    print('Creating Dataset from Training Data - End')
    return families
def create_dictionary_safe(db_train):
    """Merge and filter the feature counts of the ``'Safe'`` family.

    ``db_train['Safe']`` is read as a mapping of prefix -> feature mapping.
    The feature mappings of all prefixes are summed key by key into a single
    dictionary, which is then passed through ``filtering``.

    Args:
        db_train: Training database containing a ``'Safe'`` entry.

    Returns:
        The value returned by ``filtering`` for the merged dictionary.
    """
    print('Loading Safe Dictionary - Start')
    safe_family = db_train.get('Safe')
    total = len(safe_family.keys())
    merged = {}
    for done, prefix in enumerate(safe_family.keys(), start=1):
        features = safe_family.get(prefix)
        for feature in features:
            if feature not in merged:
                # First sighting: start from this prefix's count.
                merged[feature] = features.get(feature)
                continue
            running = merged.get(feature)
            running += features.get(feature)
            merged[feature] = running
        progress_bar.update(total, done)
    filtered = filtering(merged)
    print('Loading Safe Dictionary - End')
    return filtered
Beispiel #4
0
def determine():
    """Determine all object combinations usable as input to VISUO3D.

    Reads the dictionary pickled in ``data/triplets.pickle`` and writes one
    combination per line to ``data/working-combos.csv`` (the file is
    overwritten).  Each dictionary key is a comma-separated string: the text
    before the last comma is treated as the "pair" and the last field as the
    object.  The output first receives every key unchanged; then, for each
    pair that has at least two objects, the pair joined with every
    combination of 2 up to ``maxIter`` of those objects.

    Progress is reported through ``progress_bar``; nothing is returned.
    """
    from collections import Counter

    print("Determining working combos...")
    # NOTE(review): unpickling executes arbitrary code -- this path must only
    # ever point at a trusted, locally generated file.
    with open("data/triplets.pickle", 'rb') as pickle_file:
        dictionary = pickle.load(pickle_file)
    triplets = list(dictionary.keys())
    pair_per_triplet = [','.join(key.split(',')[:-1]) for key in triplets]
    maxIter = 4  # max number of objects appended to a pair
    # Expected number of progress steps: one per combination size per pair.
    sumIter = sum(
        min(count - 1, maxIter - 1)
        for count in Counter(pair_per_triplet).values()
        if count > 1
    )
    # De-duplicate while keeping first-seen order so the output file is
    # reproducible from run to run.
    pairs = list(dict.fromkeys(pair_per_triplet))

    # set up progress bar
    progress = 0
    # default=0 keeps an empty triplets file from raising ValueError.
    longest = max((len(p) for p in pairs), default=0)
    progress_bar.update(0, sumIter, prefix='Progress:')

    # The context manager closes the file even if a step below raises.
    with open("data/working-combos.csv", 'w') as wcFile:
        wcFile.writelines(triplet + '\n' for triplet in triplets)

        timer = startTimer()
        for pair in pairs:
            pair_prefix = pair + ','
            objs = [
                t.split(',')[-1] for t in triplets if t.startswith(pair_prefix)
            ]
            if len(objs) < 2:
                continue
            combos = []  # per-pair buffer, flushed once the pair is done
            for i in range(2, min(maxIter + 1, len(objs) + 1)):
                current = [
                    ','.join([pair] + list(o)) for o in combinations(objs, i)
                ]
                # Drop duplicate lines without scrambling their order.
                combos.extend(dict.fromkeys(current))
                progress_bar.update(
                    progress,
                    sumIter,
                    prefix='Progress:',
                    suffix=pair + '... + ' + str(int(i))
                    + ' ' * (longest - len(pair) + 8))
                progress += 1
            wcFile.writelines(combo + '\n' for combo in combos)

    endMessage = str(progress) + '/' + str(sumIter) + ' Done! ' + '(' + str(
        endTimer(timer)) + ' sec)'
    # Pad so the final message fully overwrites the longest earlier suffix.
    endMessage = endMessage.ljust(longest)
    progress_bar.update(sumIter,
                        sumIter,
                        prefix='Progress:',
                        suffix=endMessage)
                if c_map == 'malware':
                    f_tp += 1
                    tp += 1
                else:
                    f_tn += 1
                    tn += 1
            else:
                if c_map == 'malware':
                    f_fp += 1
                    fp += 1
                else:
                    f_fn += 1
                    fn += 1

            incremento += 1
            progress_bar.update(max_bar, incremento)

        print('True Positive for episode {} is: {}'.format(
            i / fold_size, f_tp))
        print('True Negative for episode {} is: {}'.format(
            i / fold_size, f_tn))
        print('False Positive for episode {} is: {}'.format(
            i / fold_size, f_fp))
        print('False Negative for episode {} is: {}'.format(
            i / fold_size, f_fn))
        f_error_rate = (f_fn + f_fp) / (f_tp + f_tn + f_fp + f_fn)
        print('Error rate for episode {} is: {}'.format(
            i / fold_size, f_error_rate))
        f_accuracy = 1 - f_error_rate
        print('Accuracy for episode {} is: {}'.format(i / fold_size,
                                                      f_accuracy))