def evaluate_classifier(data, labels, test_data, test_labels):
    '''
    Fits a fresh RandomForest on (data, labels) and scores it on each
    held-out set.

    test_data and test_labels are parallel sequences: the i-th element
    of each forms one evaluation set. They are paired with zip, so any
    surplus entries in the longer sequence are ignored.

    Returns a list holding one accuracy_score result per evaluation
    set, in the order the sets were given.
    '''
    forest = RandomForest()
    forest.fit(data, labels)
    return [accuracy_score(truth, forest.predict(samples))
            for samples, truth in zip(test_data, test_labels)]
Beispiel #2
0
def evaluate_classifier(data, labels, test_data, test_labels):
    '''
    Trains one RandomForest on (data, labels), then measures its
    accuracy separately on every (test_data[i], test_labels[i]) pair.

    Returns the accuracies as a list, one entry per pair and in the
    same order. Pairing is done with zip, so it stops at the shorter of
    the two sequences.
    '''
    model = RandomForest()
    model.fit(data, labels)

    def _accuracy_on(samples, expected):
        # Predict first, then compare against the expected labels
        predicted = model.predict(samples)
        return accuracy_score(expected, predicted)

    scores = []
    for samples, expected in zip(test_data, test_labels):
        scores.append(_accuracy_on(samples, expected))
    return scores
def _learn_model(scenario_name):
    '''
    Learns a classifier model for the specified scenario if one does 
    not already exist. 
    '''
    scenario = _scenarios[scenario_name]
    if path.exists(scenario['model']):
        return

    print 'Training the model for scenario {}...'.format(scenario_name)
    # Decide on classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        sys.stdout.write('TRAINING RANDOM FOREST\n')
        cutoff = [c * 0.1 for c in range(1, 10)]
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC(kernel='rbf', C=10, gamma=0.01)
        sys.stdout.write('TRAINING SVM\n')
        cutoff = [0.0]

    # Load the required dataset and train the model
    X, y, _ = datasets.csv2numpy(scenario['training'])
    classifier.fit(X, y)

    # Evaluate the model on the training dataset
    y_pred = classifier.decision_function(X)
    sys.stdout.write('Performance on training data:\n')
    utility.print_stats_cutoff(y, y_pred, cutoff)

    # Save the model in the corresponding file
    classifier.save_model(scenario['model'])
Beispiel #4
0
def _learn_model(scenario_name):
    '''
    Learns a classifier model for the specified scenario if one does 
    not already exist. 
    '''
    scenario = _scenarios[scenario_name]
    if path.exists(scenario['model']):
        return
    
    print 'Training the model for scenario {}...'.format(scenario_name)
    # Decide on classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        sys.stdout.write('TRAINING RANDOM FOREST\n')
        cutoff = [c * 0.1 for c in range(1, 10)]
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC(kernel='rbf', C=10, gamma=0.01)
        sys.stdout.write('TRAINING SVM\n')
        cutoff = [0.0]
    
    # Load the required dataset and train the model
    X, y, _ = datasets.csv2numpy(scenario['training'])
    classifier.fit(X, y)
    
    # Evaluate the model on the training dataset
    y_pred = classifier.decision_function(X)
    sys.stdout.write('Performance on training data:\n')
    utility.print_stats_cutoff(y, y_pred, cutoff)
    
    # Save the model in the corresponding file
    classifier.save_model(scenario['model'])
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimcry attack for the given scenario and saves the 
    resulting attack files in the location specified by the 
    configuration file. If plot evaluates to True, saves the resulting 
    plot into the specified file, otherwise shows the plot in a window. 
    '''
    print 'Running the mimicry attack...'
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load benign files
    print 'Loading attack targets from file "{}"'.format(scenario['targets'])
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    targets = zip(target_paths, target_vectors)
    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print 'Loading attack samples from file "{}"'.format(wolves)
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Set up classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print 'ATTACKING RANDOM FOREST'
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print 'ATTACKING SVM'
    print 'Loading model from "{}"'.format(scenario['model'])
    classifier.load_model(scenario['model'])

    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        scaler = pickle.load(open(config.get('datasets', 'contagio_scaler')))
        print 'Using scaler'

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]

    if plot:
        pyplot.figure(1)
    print 'Running the attack...'
    for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
        if isinstance(res, Exception):
            print res
            continue
        (target_path, mimic_path, mimic_score, wolf_score) = res
        print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score)
        print '  BEST: {p} [{s}]'.format(p=target_path, s=mimic_score)
        if path.dirname(mimic_path) != output_dir:
            print '  Moving best to {}\n'.format(
                path.join(output_dir, path.basename(wolf_path)))
            shutil.move(mimic_path,
                        path.join(output_dir, path.basename(wolf_path)))
        if plot:
            pyplot.plot([wolf_score, mimic_score])

    print 'Saved resulting attack files to {}'.format(output_dir)

    if plot:
        pyplot.title('Mimicry attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print 'Saved plot to file {}'.format(plot)
Beispiel #6
0
def fig9(tr_vec, tr_labels, te_vec, te_labels, fnames):
    '''
    Reproduction of results published in Table 10 of "Malicious PDF Detection 
    Using Metadata and Structural Features" by Charles Smutz and 
    Angelos Stavrou, ACSAC 2012.
    '''
    print 'Loading random forest classifier...'
    rf = RandomForest()
    rf.load_model(config.get('experiments', 'FTC_model'))
    ben_means, ben_devs = common.get_benign_mean_stddev(tr_vec, tr_labels)
    res = []
    # te_vec will be randomly modified in feature space.
    # f_vec will be randomly modified in feature space but the 
    # randomly generated variables will be adjusted to be 
    # valid for the given feature
    f_vec = te_vec.copy()
    print 'Got {} samples. Modifying them for attack...'.format(len(te_vec))
    print '{:>25s} {:>15s} {:>15s}'.format('Feature name', 'Feature space', 
                                           'Problem space')
    pool = multiprocessing.Pool(processes=None)
    # Modify top features one by one
    for f_name in common.top_feats:
        f_i = FeatureDescriptor.get_feature_names().index(f_name)
        f_desc = FeatureDescriptor.get_feature_description(f_name)
        print '{:>25s}'.format(f_name),
        
        # For all files
        for i in range(len(te_vec)):
            if te_labels[i] != 1:
                # Modify only malicious files
                continue
            
            first_val = True
            while True:
                # Keep randomly generating a new value
                # Stop when it becomes valid for the current feature
                new_val = random.gauss(ben_means[f_i], ben_devs[f_i])
                if first_val:
                    # Make sure we generate random values for te_vec
                    te_vec[i][f_i] = new_val
                    first_val = False
                
                # If not valid, retry 
                if f_desc['type'] == bool:
                    new_val = False if new_val < 0.5 else True
                elif f_desc['type'] == int:
                    new_val = int(round(new_val))
                if f_desc['range'][0] == FileDefined and new_val < 0:
                    continue
                elif (f_desc['range'][0] != FileDefined and 
                        new_val < f_desc['range'][0]):
                    continue
                if f_desc['type'] != bool and f_desc['range'][1] < new_val:
                    continue
                # Valid, win!
                f_vec[i][f_i] = new_val
                break
        
        # mod_data has feature values read from the problem space, 
        # i.e., by converting feature vectors to files and back
        mod_data = f_vec.copy()
        pargs = [(fnames[i], f_vec[i], i) 
                 for i, l in enumerate(te_labels) if l == 1]
        for mimic, m_id in pool.imap(mimicry_wrap, pargs):
                mod_data[m_id] = mimic
        pred = rf.predict(te_vec)
        fspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(fspace),
        pred = rf.predict(mod_data)
        pspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(pspace)
        res.append((fspace, pspace))
    return res
Beispiel #7
0
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimcry attack for the given scenario and saves the 
    resulting attack files in the location specified by the 
    configuration file. If plot evaluates to True, saves the resulting 
    plot into the specified file, otherwise shows the plot in a window. 
    '''
    print 'Running the mimicry attack...'
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load benign files
    print 'Loading attack targets from file "{}"'.format(scenario['targets'])
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    targets = zip(target_paths, target_vectors)
    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print 'Loading attack samples from file "{}"'.format(wolves)
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)
    
    # Set up classifier
    classifier = 0
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print 'ATTACKING RANDOM FOREST'
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print 'ATTACKING SVM'
    print 'Loading model from "{}"'.format(scenario['model'])
    classifier.load_model(scenario['model'])
    
    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        scaler = pickle.load(open(config.get('datasets', 'contagio_scaler')))
        print 'Using scaler'
    
    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]
    
    if plot:
        pyplot.figure(1)
    print 'Running the attack...'
    for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
        if isinstance(res, Exception):
            print res
            continue
        (target_path, mimic_path, mimic_score, wolf_score) = res
        print 'Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score)
        print '  BEST: {p} [{s}]'.format(p=target_path, s=mimic_score)
        if path.dirname(mimic_path) != output_dir:
            print '  Moving best to {}\n'.format(path.join(output_dir, 
                                                 path.basename(mimic_path)))
            shutil.move(mimic_path, output_dir)
        if plot:
            pyplot.plot([wolf_score, mimic_score])
    
    print 'Saved resulting attack files to {}'.format(output_dir)
    
    if plot:
        pyplot.title('Mimicry attack')
        axes = pyplot.axes()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print 'Saved plot to file {}'.format(plot)
Beispiel #8
0
def fig9(tr_vec, tr_labels, te_vec, te_labels, fnames):
    '''
    Reproduction of results published in Table 10 of "Malicious PDF Detection 
    Using Metadata and Structural Features" by Charles Smutz and 
    Angelos Stavrou, ACSAC 2012.
    '''
    print 'Loading random forest classifier...'
    rf = RandomForest()
    rf.load_model(config.get('experiments', 'FTC_model'))
    ben_means, ben_devs = common.get_benign_mean_stddev(tr_vec, tr_labels)
    res = []
    # te_vec will be randomly modified in feature space.
    # f_vec will be randomly modified in feature space but the
    # randomly generated variables will be adjusted to be
    # valid for the given feature
    f_vec = te_vec.copy()
    print 'Got {} samples. Modifying them for attack...'.format(len(te_vec))
    print '{:>25s} {:>15s} {:>15s}'.format('Feature name', 'Feature space',
                                           'Problem space')
    pool = multiprocessing.Pool(processes=None)
    # Modify top features one by one
    for f_name in common.top_feats:
        f_i = FeatureDescriptor.get_feature_names().index(f_name)
        f_desc = FeatureDescriptor.get_feature_description(f_name)
        print '{:>25s}'.format(f_name),

        # For all files
        for i in range(len(te_vec)):
            if te_labels[i] != 1:
                # Modify only malicious files
                continue

            first_val = True
            while True:
                # Keep randomly generating a new value
                # Stop when it becomes valid for the current feature
                new_val = random.gauss(ben_means[f_i], ben_devs[f_i])
                if first_val:
                    # Make sure we generate random values for te_vec
                    te_vec[i][f_i] = new_val
                    first_val = False

                # If not valid, retry
                if f_desc['type'] == bool:
                    new_val = False if new_val < 0.5 else True
                elif f_desc['type'] == int:
                    new_val = int(round(new_val))
                if f_desc['range'][0] == FileDefined and new_val < 0:
                    continue
                elif (f_desc['range'][0] != FileDefined
                      and new_val < f_desc['range'][0]):
                    continue
                if f_desc['type'] != bool and f_desc['range'][1] < new_val:
                    continue
                # Valid, win!
                f_vec[i][f_i] = new_val
                break

        # mod_data has feature values read from the problem space,
        # i.e., by converting feature vectors to files and back
        mod_data = f_vec.copy()
        pargs = [(fnames[i], f_vec[i], i) for i, l in enumerate(te_labels)
                 if l == 1]
        for mimic, m_id in pool.imap(mimicry_wrap, pargs):
            mod_data[m_id] = mimic
        pred = rf.predict(te_vec)
        fspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(fspace),
        pred = rf.predict(mod_data)
        pspace = accuracy_score(te_labels, pred)
        print '{:>15.3f}'.format(pspace)
        res.append((fspace, pspace))
    return res