Example No. 1
def perturbate(data, labels, subset, ben_means, ben_devs):
    '''
    Modifies a fraction (subset) of the malicious feature vectors in data. 
    The modified vectors have the features listed in common.top_feats 
    resampled from a normal distribution with per-feature mean ben_means 
    and standard deviation ben_devs.
    '''
    feat_indices = [
        FeatureDescriptor.get_feature_names().index(feat)
        for feat in common.top_feats
    ]
    num_malicious = int(round(sum(labels)))
    total = int(round(subset * num_malicious))
    indices = set(random.sample(range(num_malicious), total))
    i = mal_i = 0
    while total > 0:
        if labels[i] == 1:
            if mal_i in indices:
                for feat_i in feat_indices:
                    data[i][feat_i] = random.gauss(ben_means[feat_i],
                                                   ben_devs[feat_i])
                total -= 1
            mal_i += 1
        i += 1

    return data
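
A minimal usage sketch, assuming the dataset is loaded with csv2numpy (Example No. 5) and the benign per-feature statistics come from common.get_benign_mean_stddev as used in the fig9 examples further down; the CSV path is a placeholder:

X, y, _ = csv2numpy('train.csv')  # placeholder path
ben_means, ben_devs = common.get_benign_mean_stddev(X, y)
# Resample the top features of 50% of the malicious vectors in place
X = perturbate(X, y, 0.5, ben_means, ben_devs)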
Example No. 2
def numpy2csv(csv_out, X, y, file_names=None):
    '''
    Creates a CSV file from the given data points (X, scipy matrix) and labels 
    (y, numpy.array). The CSV file has a header. The first column is named 
    'class' and the others after PDFrate features. All features are written 
    in their respective type format (e.g., TRUE/FALSE for booleans). 
    
    If 'csv_out' is an open Python file, it will not be reopened. If 
    it is a string, a file will be created with that name. 
    '''
    we_opened_csvfile = type(csv_out) == str
    csvfile = open(csv_out, 'wb+') if we_opened_csvfile else csv_out
    # Write header
    csvfile.write('class')
    if file_names:
        csvfile.write(',filename')
    names = FeatureDescriptor.get_feature_names()
    for name in names:
        csvfile.write(',{}'.format(name))
    csvfile.write('\n')
    descs = FeatureDescriptor.get_feature_descriptions()
    # Write data
    for i in range(0, X.shape[0]):
        csvfile.write('{}'.format('TRUE' if bool(y[i]) else 'FALSE'))
        if file_names:
            csvfile.write(',{}'.format(file_names[i]))
        for j in range(0, X.shape[1]):
            feat_type = descs[names[j]]['type']
            feat_val = X[i, j]
            if feat_type == bool:
                feat_val = 'TRUE' if feat_val >= 0.5 else 'FALSE'
            elif feat_type == int:
                feat_val = int(round(feat_val))
            csvfile.write(',{}'.format(feat_val))
        csvfile.write('\n')

    if we_opened_csvfile:
        csvfile.close()
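
A short round-trip sketch, assuming csv2numpy from the same datasets module; the file names are placeholders:

X, y, file_names = csv2numpy('train.csv')
numpy2csv('train_copy.csv', X, y, file_names)
# An already open file object is also accepted and will not be reopened:
with open('train_copy2.csv', 'wb+') as out:
    numpy2csv(out, X, y, file_names)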
Example No. 3
def get_FTC_mimicry():
    '''
    Returns a tuple (X, y): a numpy.array of shape (number of samples, 
    number of features) with the feature values of all mimicry attack 
    results in the FTC scenario, and a list of labels (all 1.0).
    '''
    pdfs = utility.get_pdfs(config.get('results', 'FTC_mimicry'))
    if not pdfs:
        # Generate the attack files
        attack_mimicry('FTC')
        pdfs = utility.get_pdfs(config.get('results', 'FTC_mimicry'))

    print 'Loading feature vectors from mimicry attack results...'
    results = numpy.zeros((len(pdfs), FeatureDescriptor.get_feature_count()))
    for i in range(len(pdfs)):
        results[i, ] = FeatureEdit(pdfs[i]).retrieve_feature_vector_numpy()

    return results, [1.0 for i in range(len(pdfs))]
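
A hedged sketch of how the returned pair might be consumed, reusing the classifier-loading pattern from the fig9 examples below:

X_att, y_att = get_FTC_mimicry()
rf = RandomForest()
rf.load_model(config.get('experiments', 'FTC_model'))
print 'Accuracy on mimicry samples: {:.3f}'.format(
    accuracy_score(y_att, rf.predict(X_att)))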
Example No. 4
def get_FTC_mimicry():
    '''
    Returns a tuple (X, y): a numpy.array of shape (number of samples, 
    number of features) with the feature values of all mimicry attack 
    results in the FTC scenario, and a list of labels (all 1.0).
    '''
    pdfs = utility.get_pdfs(config.get('results', 'FTC_mimicry'))
    if not pdfs:
        # Generate the attack files
        attack_mimicry('FTC')
        pdfs = utility.get_pdfs(config.get('results', 'FTC_mimicry'))
    
    print 'Loading feature vectors from mimicry attack results...'
    results = numpy.zeros((len(pdfs), FeatureDescriptor.get_feature_count()))
    for i in range(len(pdfs)):
        results[i,] = FeatureEdit(pdfs[i]).retrieve_feature_vector_numpy()
    
    return results, [1.0 for i in range(len(pdfs))]
Example No. 5
def csv2numpy(csv_in):
    '''
    Parses a CSV input file and returns a tuple (X, y, file_names) with 
    training vectors (numpy.array), labels (numpy.array), and data point 
    IDs, respectively. 
    
    csv_in - name of a CSV file with training data points; 
                the first column in the file is supposed to be named 
                'class' and should contain the class label for the data 
                points; the second column should hold the data point ID 
                (e.g., a file name) and is returned in file_names. 
    '''
    # Parse CSV file
    csv_rows = list(csv.reader(open(csv_in, 'rb')))
    classes = {'FALSE': 0, 'TRUE': 1}
    rownum = 0
    # Count exact number of data points
    TOTAL_ROWS = 0
    for row in csv_rows:
        if row[0] in classes:
            # Count line if it begins with a class label (boolean)
            TOTAL_ROWS += 1
    # X = vector of data points, y = label vector
    X = numpy.array(numpy.zeros(
        (TOTAL_ROWS, FeatureDescriptor.get_feature_count())),
                    dtype=numpy.float64,
                    order='C')
    y = numpy.array(numpy.zeros(TOTAL_ROWS), dtype=numpy.float64, order='C')
    file_names = []
    for row in csv_rows:
        # Skip line if it doesn't begin with a class label (boolean)
        if row[0] not in classes:
            continue
        # Read class label from first row
        y[rownum] = classes[row[0]]
        featnum = 0
        file_names.append(row[1])
        for featval in row[2:]:
            if featval in classes:
                # Convert booleans to integers
                featval = classes[featval]
            X[rownum, featnum] = float(featval)
            featnum += 1
        rownum += 1
    return X, y, file_names
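
A minimal usage sketch; the CSV path is a placeholder:

X, y, file_names = csv2numpy('test.csv')
print 'Loaded {} samples with {} features ({} malicious)'.format(
    X.shape[0], X.shape[1], int(sum(y)))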
Example No. 6
def perturbate(data, labels, subset, ben_means, ben_devs):
    """
    Modifies a fraction (subset) of the malicious feature vectors in data. 
    The modified vectors have the features listed in common.top_feats 
    resampled from a normal distribution with per-feature mean ben_means 
    and standard deviation ben_devs.
    """
    feat_indices = [FeatureDescriptor.get_feature_names().index(feat) for feat in common.top_feats]
    num_malicious = int(round(sum(labels)))
    total = int(round(subset * num_malicious))
    indices = set(random.sample(range(num_malicious), total))
    i = mal_i = 0
    while total > 0:
        if labels[i] == 1:
            if mal_i in indices:
                for feat_i in feat_indices:
                    data[i][feat_i] = random.gauss(ben_means[feat_i], ben_devs[feat_i])
                total -= 1
            mal_i += 1
        i += 1

    return data
Example No. 7
parser.add_argument('threshold',
                    help="threshold for determining whether a neuron is activated",
                    type=float)
parser.add_argument('-t',
                    '--target_model',
                    help="target model whose prediction we want to differ",
                    choices=[0, 1, 2],
                    default=0,
                    type=int)

args = parser.parse_args()

X_test, _, names = datasets.csv2numpy('./dataset/test.csv')
X_test = X_test.astype('float32')
num_features = X_test.shape[1]
feat_names = FeatureDescriptor.get_feature_names()
incre_idx, incre_decre_idx = init_feature_constraints(feat_names)

# define input tensor as a placeholder
input_tensor = Input(shape=(num_features, ))

# load multiple models sharing same input tensor
K.set_learning_phase(0)
model1 = Model1(input_tensor=input_tensor, load_weights=True)
model2 = Model2(input_tensor=input_tensor, load_weights=True)
model3 = Model3(input_tensor=input_tensor, load_weights=True)
# init coverage table
model_layer_dict1, model_layer_dict2, model_layer_dict3 = init_coverage_tables(
    model1, model2, model3)

# ==============================================================================================
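
For illustration, because all three models are built on the same input tensor, one test vector can be fed to each of them when looking for differential behavior. A minimal sketch, assuming Model1/Model2/Model3 return standard Keras Model instances:

# Predict one test sample (as a batch of size one) with every model.
x = X_test[0].reshape(1, num_features)
preds = [m.predict(x) for m in (model1, model2, model3)]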
Example No. 8
def fig9(tr_vec, tr_labels, te_vec, te_labels, fnames):
    '''
    Reproduction of results published in Table 10 of "Malicious PDF Detection 
    Using Metadata and Structural Features" by Charles Smutz and 
    Angelos Stavrou, ACSAC 2012.
    '''
    print 'Loading random forest classifier...'
    rf = RandomForest()
    rf.load_model(config.get('experiments', 'FTC_model'))
    ben_means, ben_devs = common.get_benign_mean_stddev(tr_vec, tr_labels)
    res = []
    # te_vec will be randomly modified in feature space.
    # f_vec will be randomly modified in feature space but the 
    # randomly generated variables will be adjusted to be 
    # valid for the given feature
    f_vec = te_vec.copy()
    print 'Got {} samples. Modifying them for attack...'.format(len(te_vec))
    print '{:>25s} {:>15s} {:>15s}'.format('Feature name', 'Feature space', 
                                           'Problem space')
    pool = multiprocessing.Pool(processes=None)
    # Modify top features one by one
    for f_name in common.top_feats:
        f_i = FeatureDescriptor.get_feature_names().index(f_name)
        f_desc = FeatureDescriptor.get_feature_description(f_name)
        print '{:>25s}'.format(f_name),
        
        # For all files
        for i in range(len(te_vec)):
            if te_labels[i] != 1:
                # Modify only malicious files
                continue
            
            first_val = True
            while True:
                # Keep randomly generating a new value
                # Stop when it becomes valid for the current feature
                new_val = random.gauss(ben_means[f_i], ben_devs[f_i])
                if first_val:
                    # Make sure we generate random values for te_vec
                    te_vec[i][f_i] = new_val
                    first_val = False
                
                # If not valid, retry 
                if f_desc['type'] == bool:
                    new_val = False if new_val < 0.5 else True
                elif f_desc['type'] == int:
                    new_val = int(round(new_val))
                if f_desc['range'][0] == FileDefined and new_val < 0:
                    continue
                elif (f_desc['range'][0] != FileDefined and 
                        new_val < f_desc['range'][0]):
                    continue
                if f_desc['type'] != bool and f_desc['range'][1] < new_val:
                    continue
                # Valid, win!
                f_vec[i][f_i] = new_val
                break
        
        # mod_data has feature values read from the problem space, 
        # i.e., by converting feature vectors to files and back
        mod_data = f_vec.copy()
        pargs = [(fnames[i], f_vec[i], i) 
                 for i, l in enumerate(te_labels) if l == 1]
        for mimic, m_id in pool.imap(mimicry_wrap, pargs):
            mod_data[m_id] = mimic
        pred = rf.predict(te_vec)
        fspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(fspace),
        pred = rf.predict(mod_data)
        pspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(pspace)
        res.append((fspace, pspace))
    return res
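
A hypothetical driver for this function; the CSV paths are placeholders and csv2numpy is the loader shown in Example No. 5:

tr_vec, tr_labels, _ = csv2numpy('train.csv')
te_vec, te_labels, fnames = csv2numpy('test.csv')
results = fig9(tr_vec, tr_labels, te_vec, te_labels, fnames)
for f_name, (fspace, pspace) in zip(common.top_feats, results):
    print '{}: feature space {:.3f}, problem space {:.3f}'.format(
        f_name, fspace, pspace)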
Example No. 9
'''
Implementation of the mimicry attack.

Created on July 1, 2013.
'''

import os
import random
import sys
import numpy

from mimicus.tools.featureedit import FeatureEdit
from mimicus.tools.datasets import numpy2csv, csv2numpy
from mimicus.tools.featureedit import FeatureDescriptor

descs = FeatureDescriptor.get_feature_descriptions()
names = FeatureDescriptor.get_feature_names()


def binarize(X_old):
    X_new = numpy.copy(X_old)
    for i in range(0, X_new.shape[0]):
        for j in range(0, X_new.shape[1]):
            feat_type = descs[names[j]]['type']
            if feat_type != bool:
                if X_new[i][j] != 0:
                    X_new[i][j] = 1
    return X_new
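
A minimal usage sketch for binarize; the CSV path is a placeholder and csv2numpy is imported above:

X, y, _ = csv2numpy('train.csv')
X_bin = binarize(X)  # non-boolean features become 0/1 presence indicators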


def validate(X_old):
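    # NOTE: the original body is truncated in this listing. The lines below
    # are a hypothetical completion that mirrors the per-feature type handling
    # of binarize above: boolean features are snapped to 0/1 and integer
    # features are rounded.
    X_new = numpy.copy(X_old)
    for i in range(0, X_new.shape[0]):
        for j in range(0, X_new.shape[1]):
            feat_type = descs[names[j]]['type']
            if feat_type == bool:
                X_new[i][j] = 1.0 if X_new[i][j] >= 0.5 else 0.0
            elif feat_type == int:
                X_new[i][j] = round(X_new[i][j])
    return X_new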
Example No. 10
def fig9(tr_vec, tr_labels, te_vec, te_labels, fnames):
    '''
    Reproduction of results published in Table 10 of "Malicious PDF Detection 
    Using Metadata and Structural Features" by Charles Smutz and 
    Angelos Stavrou, ACSAC 2012.
    '''
    print 'Loading random forest classifier...'
    rf = RandomForest()
    rf.load_model(config.get('experiments', 'FTC_model'))
    ben_means, ben_devs = common.get_benign_mean_stddev(tr_vec, tr_labels)
    res = []
    # te_vec will be randomly modified in feature space.
    # f_vec will be randomly modified in feature space but the
    # randomly generated variables will be adjusted to be
    # valid for the given feature
    f_vec = te_vec.copy()
    print 'Got {} samples. Modifying them for attack...'.format(len(te_vec))
    print '{:>25s} {:>15s} {:>15s}'.format('Feature name', 'Feature space',
                                           'Problem space')
    pool = multiprocessing.Pool(processes=None)
    # Modify top features one by one
    for f_name in common.top_feats:
        f_i = FeatureDescriptor.get_feature_names().index(f_name)
        f_desc = FeatureDescriptor.get_feature_description(f_name)
        print '{:>25s}'.format(f_name),

        # For all files
        for i in range(len(te_vec)):
            if te_labels[i] != 1:
                # Modify only malicious files
                continue

            first_val = True
            while True:
                # Keep randomly generating a new value
                # Stop when it becomes valid for the current feature
                new_val = random.gauss(ben_means[f_i], ben_devs[f_i])
                if first_val:
                    # Make sure we generate random values for te_vec
                    te_vec[i][f_i] = new_val
                    first_val = False

                # If not valid, retry
                if f_desc['type'] == bool:
                    new_val = False if new_val < 0.5 else True
                elif f_desc['type'] == int:
                    new_val = int(round(new_val))
                if f_desc['range'][0] == FileDefined and new_val < 0:
                    continue
                elif (f_desc['range'][0] != FileDefined
                      and new_val < f_desc['range'][0]):
                    continue
                if f_desc['type'] != bool and f_desc['range'][1] < new_val:
                    continue
                # Valid, win!
                f_vec[i][f_i] = new_val
                break

        # mod_data has feature values read from the problem space,
        # i.e., by converting feature vectors to files and back
        mod_data = f_vec.copy()
        pargs = [(fnames[i], f_vec[i], i) for i, l in enumerate(te_labels)
                 if l == 1]
        for mimic, m_id in pool.imap(mimicry_wrap, pargs):
            mod_data[m_id] = mimic
        pred = rf.predict(te_vec)
        fspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(fspace),
        pred = rf.predict(mod_data)
        pspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(pspace)
        res.append((fspace, pspace))
    return res