def test_save_model(self):
    """Fit the forest, check its predictions, then save the model.

    The model is written to the path configured under
    ``RandomForest_test/modelfile`` and removed again afterwards.
    """
    X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
    self.rf.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
    # assertEqual reports the actual length on failure, unlike
    # assertTrue(a == b) which only says "False is not true".
    self.assertEqual(len(self.rf.predict(newX)), 20)
    model_path = config.get('RandomForest_test', 'modelfile')
    self.rf.save_model(model_path)
    # os.remove raises if the file is missing, so this also checks that
    # save_model really created the file.
    os.remove(model_path)
Beispiel #2
0
 def test_save_model(self):
     """Fit the SVC, check its predictions, then save the model.

     The model is written to the path configured under
     ``sklearn_SVC_test/modelfile`` and removed again afterwards.
     """
     X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
     self.svc.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
     # assertEqual reports the actual length on failure, unlike
     # assertTrue(a == b) which only says "False is not true".
     self.assertEqual(len(self.svc.predict(newX)), 20)
     model_path = config.get('sklearn_SVC_test', 'modelfile')
     self.svc.save_model(model_path)
     # os.remove raises if the file is missing, so this also checks that
     # save_model really created the file.
     os.remove(model_path)
Beispiel #3
0
 def test_predict(self):
     """Fit on the training CSV and expect 20 predictions for the novel CSV."""
     X, y, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'noveldata'))
     # assertEqual shows the actual count when the check fails.
     self.assertEqual(len(self.rf.predict(newX)), 20)
Beispiel #4
0
 def test_decision_function(self):
     """Fit on the training CSV and expect 20 decision values for the novel CSV."""
     X, y, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'noveldata'))
     # assertEqual shows the actual count when the check fails.
     self.assertEqual(len(self.rf.decision_function(newX)), 20)
Beispiel #5
0
def main():
    '''
    Runs the fig9 evaluation on the contagio datasets named in the
    configuration and optionally plots the two resulting score series.

    Command-line options:
      --plot FILE  save the plot to FILE
      --show       show the plot in a window
    If neither option is given, only the evaluation is run.

    Returns 0.
    '''
    random.seed(0)
    parser = ArgumentParser()
    parser.add_argument('--plot',
                        help='Where to save plot (file name)',
                        default=False)
    parser.add_argument('--show',
                        help='Show plot in a window',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    print('Loading training data from CSV...')
    tr_data, tr_labels, _ = csv2numpy(config.get('datasets', 'contagio'))

    print('Loading test data from CSV...')
    te_data, te_labels, te_fnames = csv2numpy(
        config.get('datasets', 'contagio_test'))

    print('Evaluating...')
    scores = fig9(tr_data, tr_labels, te_data, te_labels, te_fnames)

    if not (args.plot or args.show):
        return 0

    # Plot: scores is a sequence of (feature-space, problem-space) pairs
    feat_points, file_points = zip(*scores)
    fig = pyplot.figure()
    pyplot.plot(feat_points,
                label='Feature space',
                marker='o',
                color='k',
                linewidth=2)
    pyplot.plot(file_points,
                label='Problem space',
                marker='^',
                color='k',
                linewidth=2,
                linestyle='--')
    axes = fig.gca()

    # Set up axes and labels
    axes.yaxis.set_ticks([r / 10.0 for r in range(11)])
    axes.yaxis.grid()
    axes.set_ylim(0, 1)
    axes.set_ylabel('Accuracy')
    xticklabels = [common.top_feats[0]
                   ] + ['(+) ' + name for name in common.top_feats[1:]]
    # The series are plotted against their indices 0..n-1, so pin one tick
    # per index before labelling; otherwise the labels are attached to
    # whatever tick positions matplotlib picked automatically.
    axes.set_xticks(list(range(len(xticklabels))))
    axes.set_xticklabels(xticklabels, rotation=60, ha='right')

    fig.subplots_adjust(bottom=0.34, top=0.95, left=0.11, right=0.98)
    pyplot.legend(loc='lower left')
    # Save before showing: once the window opened by show() is closed the
    # figure may be discarded, and a later savefig would write a blank image.
    if args.plot:
        fig.savefig(args.plot, dpi=300, bbox_inches='tight')
    if args.show:
        pyplot.show()

    return 0
Beispiel #6
0
 def test_load_model(self):
     """A saved and reloaded SVC must reproduce the original predictions."""
     X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
     self.svc.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
     prediction = self.svc.predict(newX)
     model_path = config.get('sklearn_SVC_test', 'modelfile')
     self.svc.save_model(model_path)
     try:
         newsvc = sklearn_SVC()
         newsvc.load_model(model_path)
         self.assertTrue(numpy.array_equal(prediction, newsvc.predict(newX)))
     finally:
         # Remove the model file even when loading or the comparison
         # fails, so a failed run does not leave it behind.
         os.remove(model_path)
 def test_load_model(self):
     """A saved and reloaded forest must reproduce the original predictions."""
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     prediction = self.rf.predict(newX)
     model_path = config.get('RandomForest_test', 'modelfile')
     self.rf.save_model(model_path)
     try:
         newrf = RandomForest()
         newrf.load_model(model_path)
         self.assertTrue(numpy.array_equal(prediction, newrf.predict(newX)))
     finally:
         # Remove the model file even when loading or the comparison
         # fails, so a failed run does not leave it behind.
         os.remove(model_path)
Beispiel #8
0
def main():
    '''
    Runs the fig11 evaluation on the contagio datasets named in the
    configuration and optionally plots the two resulting score series.

    Command-line options:
      --plot FILE     save the plot to FILE
      --show [VALUE]  show the plot in a window
    If neither option is given, only the evaluation is run.

    Returns 0.
    '''
    random.seed(0)
    parser = ArgumentParser()
    parser.add_argument('--plot',
                        help='Where to save plot (file name)',
                        default=False)
    # Without an action, "--show" demanded a value although the help text
    # describes a switch. nargs='?' accepts a bare "--show" (stored as True)
    # while still tolerating the old "--show VALUE" form.
    parser.add_argument('--show',
                        help='Show plot in a window',
                        default=False,
                        nargs='?',
                        const=True)
    args = parser.parse_args()

    print('Loading training data from CSV...')
    tr_data, tr_labels, tr_fnames = csv2numpy(
        config.get('datasets', 'contagio'))

    print('Loading test data from CSV...')
    te_data, te_labels, _ = csv2numpy(config.get('datasets', 'contagio_test'))

    print('Evaluating...')
    scores = fig11(tr_data, tr_labels, te_data, te_labels, tr_fnames)

    if not (args.plot or args.show):
        return 0

    # Plot: scores is a sequence of (clean, mimicry) pairs
    original, our_mimicry = zip(*scores)
    fig = pyplot.figure()
    pyplot.plot(original,
                label='Clean data',
                marker='o',
                color='k',
                linewidth=2)
    pyplot.plot(our_mimicry,
                label='Our mimicry',
                marker='+',
                color='k',
                linewidth=2,
                linestyle=':')
    axes = fig.gca()

    # Set up axes and labels
    axes.yaxis.set_ticks([r / 10.0 for r in range(11)])
    axes.yaxis.grid()
    axes.set_ylim(0, 1)
    axes.set_ylabel('Accuracy')
    xticklabels = ['0', '0.05', '0.1', '0.5', '1', '5', '10', '50', '100']
    # The series are plotted against their indices 0..n-1, so pin one tick
    # per index before labelling; otherwise the labels are attached to
    # whatever tick positions matplotlib picked automatically.
    axes.set_xticks(list(range(len(xticklabels))))
    axes.set_xticklabels(xticklabels, rotation=0)
    axes.set_xlabel('Training set perturbation (%)')

    fig.subplots_adjust(bottom=0.13, top=0.95, left=0.11, right=0.96)
    pyplot.legend(loc='lower right')
    # Save before showing: once the window opened by show() is closed the
    # figure may be discarded, and a later savefig would write a blank image.
    if args.plot:
        fig.savefig(args.plot, dpi=300, bbox_inches='tight')
    if args.show:
        pyplot.show()

    return 0
Beispiel #9
0
 def test_load_model(self):
     """A saved and reloaded forest must reproduce the original predictions."""
     X, y, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'noveldata'))
     prediction = self.rf.predict(newX)
     model_path = config.get('RandomForest_test', 'modelfile')
     self.rf.save_model(model_path)
     try:
         newrf = RandomForest()
         newrf.load_model(model_path)
         self.assertTrue(numpy.array_equal(prediction, newrf.predict(newX)))
     finally:
         # Remove the model file even when loading or the comparison
         # fails, so a failed run does not leave it behind.
         os.remove(model_path)
Beispiel #10
0
def main():
    '''
    Runs the fig9 evaluation on the contagio datasets named in the
    configuration and optionally plots the two resulting score series.

    Command-line options:
      --plot FILE  save the plot to FILE
      --show       show the plot in a window
    If neither option is given, only the evaluation is run.

    Returns 0.
    '''
    random.seed(0)
    parser = ArgumentParser()
    parser.add_argument('--plot', help='Where to save plot (file name)',
                        default=False)
    parser.add_argument('--show', help='Show plot in a window', default=False,
                        action='store_true')
    args = parser.parse_args()

    print('Loading training data from CSV...')
    tr_data, tr_labels, _ = csv2numpy(config.get('datasets', 'contagio'))

    print('Loading test data from CSV...')
    te_data, te_labels, te_fnames = csv2numpy(config.get('datasets',
                                                         'contagio_test'))

    print('Evaluating...')
    scores = fig9(tr_data, tr_labels, te_data, te_labels, te_fnames)

    if not (args.plot or args.show):
        return 0

    # Plot: scores is a sequence of (feature-space, problem-space) pairs
    feat_points, file_points = zip(*scores)
    fig = pyplot.figure()
    pyplot.plot(feat_points, label='Feature space',
                marker='o', color='k', linewidth=2)
    pyplot.plot(file_points, label='Problem space',
                marker='^', color='k', linewidth=2, linestyle='--')
    axes = fig.gca()

    # Set up axes and labels
    axes.yaxis.set_ticks([r / 10.0 for r in range(11)])
    axes.yaxis.grid()
    axes.set_ylim(0, 1)
    axes.set_ylabel('Accuracy')
    xticklabels = [common.top_feats[0]] + ['(+) ' + name
                                           for name in common.top_feats[1:]]
    # The series are plotted against their indices 0..n-1, so pin one tick
    # per index before labelling; otherwise the labels are attached to
    # whatever tick positions matplotlib picked automatically.
    axes.set_xticks(list(range(len(xticklabels))))
    axes.set_xticklabels(xticklabels, rotation=60, ha='right')

    fig.subplots_adjust(bottom=0.34, top=0.95, left=0.11, right=0.98)
    pyplot.legend(loc='lower left')
    # Save before showing: once the window opened by show() is closed the
    # figure may be discarded, and a later savefig would write a blank image.
    if args.plot:
        fig.savefig(args.plot, dpi=300, bbox_inches='tight')
    if args.show:
        pyplot.show()

    return 0
def _learn_model(scenario_name):
    '''
    Learns a classifier model for the specified scenario if one does
    not already exist.

    The scenario is looked up in _scenarios; its 'classifier' entry must
    be 'rf' or 'svm'. The model is trained on the scenario's 'training'
    CSV, its performance on that same data is printed, and it is saved
    to the scenario's 'model' path.

    Raises ValueError if the scenario names an unknown classifier.
    '''
    scenario = _scenarios[scenario_name]
    if path.exists(scenario['model']):
        return

    print('Training the model for scenario {}...'.format(scenario_name))
    # Decide on classifier
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        sys.stdout.write('TRAINING RANDOM FOREST\n')
        cutoff = [c * 0.1 for c in range(1, 10)]
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC(kernel='rbf', C=10, gamma=0.01)
        sys.stdout.write('TRAINING SVM\n')
        cutoff = [0.0]
    else:
        # Previously this fell through with classifier = 0 and cutoff
        # unbound, failing later with an unrelated AttributeError.
        raise ValueError('Unknown classifier {!r} for scenario {}'.format(
            scenario['classifier'], scenario_name))

    # Load the required dataset and train the model
    X, y, _ = datasets.csv2numpy(scenario['training'])
    classifier.fit(X, y)

    # Evaluate the model on the training dataset
    y_pred = classifier.decision_function(X)
    sys.stdout.write('Performance on training data:\n')
    utility.print_stats_cutoff(y, y_pred, cutoff)

    # Save the model in the corresponding file
    classifier.save_model(scenario['model'])
Beispiel #12
0
def _learn_model(scenario_name):
    '''
    Learns a classifier model for the specified scenario if one does
    not already exist.

    The scenario is looked up in _scenarios; its 'classifier' entry must
    be 'rf' or 'svm'. The model is trained on the scenario's 'training'
    CSV, its performance on that same data is printed, and it is saved
    to the scenario's 'model' path.

    Raises ValueError if the scenario names an unknown classifier.
    '''
    scenario = _scenarios[scenario_name]
    if path.exists(scenario['model']):
        return

    print('Training the model for scenario {}...'.format(scenario_name))
    # Decide on classifier
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        sys.stdout.write('TRAINING RANDOM FOREST\n')
        cutoff = [c * 0.1 for c in range(1, 10)]
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC(kernel='rbf', C=10, gamma=0.01)
        sys.stdout.write('TRAINING SVM\n')
        cutoff = [0.0]
    else:
        # Previously this fell through with classifier = 0 and cutoff
        # unbound, failing later with an unrelated AttributeError.
        raise ValueError('Unknown classifier {!r} for scenario {}'.format(
            scenario['classifier'], scenario_name))

    # Load the required dataset and train the model
    X, y, _ = datasets.csv2numpy(scenario['training'])
    classifier.fit(X, y)

    # Evaluate the model on the training dataset
    y_pred = classifier.decision_function(X)
    sys.stdout.write('Performance on training data:\n')
    utility.print_stats_cutoff(y, y_pred, cutoff)

    # Save the model in the corresponding file
    classifier.save_model(scenario['model'])
Beispiel #13
0
def main():
    """
    Runs the fig11 evaluation on the contagio datasets named in the
    configuration and optionally plots the two resulting score series.

    Command-line options:
      --plot FILE     save the plot to FILE
      --show [VALUE]  show the plot in a window
    If neither option is given, only the evaluation is run.

    Returns 0.
    """
    random.seed(0)
    parser = ArgumentParser()
    parser.add_argument("--plot", help="Where to save plot (file name)", default=False)
    # Without an action, "--show" demanded a value although the help text
    # describes a switch. nargs="?" accepts a bare "--show" (stored as True)
    # while still tolerating the old "--show VALUE" form.
    parser.add_argument("--show", help="Show plot in a window", default=False, nargs="?", const=True)
    args = parser.parse_args()

    print("Loading training data from CSV...")
    tr_data, tr_labels, tr_fnames = csv2numpy(config.get("datasets", "contagio"))

    print("Loading test data from CSV...")
    te_data, te_labels, _ = csv2numpy(config.get("datasets", "contagio_test"))

    print("Evaluating...")
    scores = fig11(tr_data, tr_labels, te_data, te_labels, tr_fnames)

    if not (args.plot or args.show):
        return 0

    # Plot: scores is a sequence of (clean, mimicry) pairs
    original, our_mimicry = zip(*scores)
    fig = pyplot.figure()
    pyplot.plot(original, label="Clean data", marker="o", color="k", linewidth=2)
    pyplot.plot(our_mimicry, label="Our mimicry", marker="+", color="k", linewidth=2, linestyle=":")
    axes = fig.gca()

    # Set up axes and labels
    axes.yaxis.set_ticks([r / 10.0 for r in range(11)])
    axes.yaxis.grid()
    axes.set_ylim(0, 1)
    axes.set_ylabel("Accuracy")
    xticklabels = ["0", "0.05", "0.1", "0.5", "1", "5", "10", "50", "100"]
    # The series are plotted against their indices 0..n-1, so pin one tick
    # per index before labelling; otherwise the labels are attached to
    # whatever tick positions matplotlib picked automatically.
    axes.set_xticks(list(range(len(xticklabels))))
    axes.set_xticklabels(xticklabels, rotation=0)
    axes.set_xlabel("Training set perturbation (%)")

    fig.subplots_adjust(bottom=0.13, top=0.95, left=0.11, right=0.96)
    pyplot.legend(loc="lower right")
    # Save before showing: once the window opened by show() is closed the
    # figure may be discarded, and a later savefig would write a blank image.
    if args.plot:
        fig.savefig(args.plot, dpi=300, bbox_inches="tight")
    if args.show:
        pyplot.show()

    return 0
Beispiel #14
0
 def test_fit(self):
     """Smoke test: fitting the forest on the configured training CSV must not raise."""
     train_csv = config.get('RandomForest_test', 'traindata')
     # csv2numpy yields a 3-tuple; the third element is not needed here.
     features, labels, _ = datasets.csv2numpy(train_csv)
     self.rf.fit(features, labels)
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario, using as targets
    the n_trial benign files with the lowest classifier scores.

    The classifier, scaler and datasets are read from the placeholder
    paths in the configuration section at the top of the body, which
    must be edited before use. The plot argument is accepted but not
    used by this variant.
    '''
    # Configuration (should be modified here)
    target_files = "replace with the path to the .csv of the pdfrateR benign files in the training data (without standardizartion)"
    # NOTE(review): any non-empty string is truthy, so this placeholder
    # behaves like True until it is replaced with a real boolean.
    is_binary = "set it to be true if pdfrateB is to be evaluated"
    scaler_path = "replaced with the path to the scaler"
    training_data_path = "replaced with the path to .csv of the standardized training data of pdfrateR"
    # The train/test sets are only used by the commented-out training
    # path below; they are still loaded, as before.
    X_train, y_train, _ = datasets.csv2numpy(training_data_path)
    test_data_path = "replaced with the path to .csv of the standardized training data of pdfrateB"
    X_test, y_test, _ = datasets.csv2numpy(test_data_path)
    n_trial = 30
    model_path = "Replaced with the path to the model to be evaluated"

    print('Running the mimicry attack...')
    #print("First train and test the target classifier")
    #classifier = SVC(kernel='rbf', C=10, gamma=0.01)
    #classifier.fit(X_train, y_train)
    #y_pred = classifier.predict(X_test)
    #print(confusion_matrix(y_pred, y_test))

    print("First load the classifier")
    # Context managers close the pickle files instead of leaking handles.
    # NOTE(review): pickle on a file from an untrusted source can execute
    # arbitrary code; only load models and scalers you produced yourself.
    with open(model_path, 'r') as model_file:
        classifier = pickle.load(model_file)

    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))

    # Make results reproducible
    random.seed(0)

    # Score all benign files and keep the n_trial lowest-scoring ones
    ben_file_paths = target_files
    X_ben, y_ben, ben_paths = datasets.csv2numpy(ben_file_paths)
    with open(scaler_path) as scaler_file:
        loaded_scaler = pickle.load(scaler_file)

    # Keep an unscaled copy: the targets handed to the attack are the raw
    # feature vectors, while scoring uses the scaled ones.
    X_ben_copy = copy.deepcopy(X_ben)
    X_ben_scaled = loaded_scaler.transform(X_ben)

    y_score_ben = classifier.decision_function(X_ben_scaled)
    most_ben_ind = y_score_ben.argsort()[:n_trial]
    y_score_most_ben = y_score_ben[most_ben_ind]
    print(most_ben_ind)
    print("The ave score of the most 30 benign: {}".format(
        numpy.mean(y_score_most_ben)))
    print("The ave score of the benign: {}".format(numpy.mean(y_score_ben)))

    target_vectors = X_ben_copy[most_ben_ind]
    target_paths = list(numpy.array(ben_paths)[most_ben_ind])

    # Load benign files
    print("Loading the most benign files as targets")
    # list() keeps this a picklable list on Python 3 as well, where zip()
    # returns a one-shot iterator.
    targets = list(zip(target_paths, target_vectors))

    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from file "{}"'.format(wolves))
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Standardize data points if necessary; reuse the scaler loaded above
    # rather than unpickling the same file a second time.
    scaler = None
    if 'scaled' in scenario['model']:
        scaler = loaded_scaler
        print('Using scaler')

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler, is_binary)
             for mal in malicious]

    print('Running the attack...')

    try:
        for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
            if isinstance(res, Exception):
                print(res)
                continue
            (target_path, mimic_path, mimic_score, wolf_score) = res
            # NOTE(review): the per-file report and the move of the best
            # mimic into output_dir were disabled in this variant (they
            # lived in a no-op string literal), so the final message below
            # may not match where the files actually are -- confirm.
    finally:
        # Shut the workers down; every result has been consumed by now
        # unless an error interrupted the loop.
        pool.terminate()
        pool.join()
    print('Saved resulting attack files to {}'.format(output_dir))
 def test_decision_function(self):
     """Fit on the training CSV and expect 20 decision values for the novel CSV."""
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     # assertEqual shows the actual count when the check fails.
     self.assertEqual(len(self.rf.decision_function(newX)), 20)
Beispiel #17
0
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario and saves the
    resulting attack files in the location specified by the
    configuration file.

    plot controls plotting of the (original, mimic) score pairs: a false
    value disables plotting, the string 'show' shows the plot in a window
    and any other value is used as the file name to save the plot to.

    Raises ValueError if the scenario names an unknown classifier.
    '''
    print('Running the mimicry attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load benign files
    print('Loading attack targets from file "{}"'.format(scenario['targets']))
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    # list() keeps this a picklable list on Python 3 as well, where zip()
    # returns a one-shot iterator.
    targets = list(zip(target_paths, target_vectors))
    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from file "{}"'.format(wolves))
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Set up classifier
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print('ATTACKING RANDOM FOREST')
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print('ATTACKING SVM')
    else:
        # Previously classifier stayed 0 and load_model failed with an
        # unrelated AttributeError.
        raise ValueError('Unknown classifier {!r} for scenario {}'.format(
            scenario['classifier'], scenario_name))
    print('Loading model from "{}"'.format(scenario['model']))
    classifier.load_model(scenario['model'])

    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        # The context manager closes the file instead of leaking the handle.
        # NOTE(review): only unpickle scaler files from a trusted source.
        with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
            scaler = pickle.load(scaler_file)
        print('Using scaler')

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
            if isinstance(res, Exception):
                print(res)
                continue
            (target_path, mimic_path, mimic_score, wolf_score) = res
            print('Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score))
            print('  BEST: {p} [{s}]'.format(p=target_path, s=mimic_score))
            if path.dirname(mimic_path) != output_dir:
                print('  Moving best to {}\n'.format(
                    path.join(output_dir, path.basename(mimic_path))))
                shutil.move(mimic_path, output_dir)
            if plot:
                pyplot.plot([wolf_score, mimic_score])
    finally:
        # Shut the workers down; every result has been consumed by now
        # unless an error interrupted the loop.
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('Mimicry attack')
        # gca() returns the axes that were plotted on; a bare pyplot.axes()
        # can create a new, empty axes on top of them in newer matplotlib.
        axes = pyplot.gca()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
Beispiel #18
0
def attack_gdkde(scenario_name, plot=False):
    '''
    Invokes the GD-KDE attack for the given scenario and saves the
    resulting attack files in the location specified by the
    configuration file.

    plot controls plotting of the per-file score sequences: a false
    value disables plotting, the string 'show' shows the plot in a window
    and any other value is used as the file name to save the plot to.
    '''
    print('Running the GD-KDE attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_gdkde'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load and print malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from "{}"'.format(wolves))
    malicious = utility.get_pdfs(wolves)
    if not malicious:
        _attack_files_missing(wolves)

    # Load an SVM trained with scaled data. The context manager closes
    # the scaler file instead of leaking the handle.
    # NOTE(review): only unpickle scaler files from a trusted source.
    with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
        scaler = pickle.load(scaler_file)
    print('Using scaler')
    svm = sklearn_SVC()
    print('Loading model from "{}"'.format(scenario['model']))
    svm.load_model(scenario['model'])

    # Load the training data used for kernel density estimation
    print('Loading dataset from file "{}"'.format(scenario['training']))
    X_train, y_train, _ = datasets.csv2numpy(scenario['training'])
    # Subsample for faster execution. The sample size is capped at the
    # dataset size so that training sets with fewer than 500 rows no
    # longer make random.sample raise ValueError.
    ind_sample = random.sample(range(len(y_train)), min(500, len(y_train)))
    X_train = X_train[ind_sample, :]
    y_train = y_train[ind_sample]

    # Set parameters
    kde_reg = 10
    kde_width = 50
    step = 1
    max_iter = 50

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(svm, fname, scaler, X_train, y_train, kde_reg,
              kde_width, step, max_iter, False) for fname in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for res, oldf in zip(pool.imap(_gdkde_wrapper, pargs), malicious):
            if isinstance(res, Exception):
                print(res)
                continue
            (_, fseq, _, _, attack_file) = res
            print('Processing file "{}":'.format(oldf))
            print('  scores: {}'.format(', '.join([str(s) for s in fseq])))
            print('Result: "{}"'.format(attack_file))
            if path.dirname(attack_file) != output_dir:
                shutil.move(attack_file, output_dir)
            if plot:
                pyplot.plot(fseq, label=oldf)
    finally:
        # Shut the workers down; every result has been consumed by now
        # unless an error interrupted the loop.
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('GD-KDE attack')
        # gca() returns the axes that were plotted on; a bare pyplot.axes()
        # can create a new, empty axes on top of them in newer matplotlib.
        axes = pyplot.gca()
        axes.set_xlabel('Iterations')
        axes.set_xlim(0, max_iter + 1)
        axes.set_ylabel('SVM score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
    if model_name == '1':
        model = Model1()
        save_model_name = './Model1.h5'
    elif model_name == '2':
        model = Model2()
        save_model_name = './Model2.h5'
    elif model_name == '3':
        model = Model3()
        save_model_name = './Model3.h5'
    else:
        print(bcolors.FAIL + 'invalid model name, must one of 1, 2 or 3' +
              bcolors.ENDC)

    # the data, shuffled and split between train and test sets
    X_train, y_train, _ = datasets.csv2numpy('./dataset/train.csv')
    X_test, y_test, _ = datasets.csv2numpy('./dataset/test.csv')

    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)

    # trainig
    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              validation_data=(X_test, y_test),
 def test_fit(self):
     """Smoke test: fitting the forest on the configured training CSV must not raise."""
     train_csv = config.get('RandomForest_test', 'traindata')
     # csv2numpy yields a 3-tuple; the third element is not needed here.
     features, labels, _ = datasets.csv2numpy(train_csv)
     self.rf.fit(features, labels)
 def test_predict(self):
     """Fit on the training CSV and expect 20 predictions for the novel CSV."""
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     # assertEqual shows the actual count when the check fails.
     self.assertEqual(len(self.rf.predict(newX)), 20)
Beispiel #22
0
 def test_decision_function(self):
     """Fit on the training CSV and expect 20 decision values for the novel CSV."""
     X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
     self.svc.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
     # assertEqual shows the actual count when the check fails.
     self.assertEqual(len(self.svc.decision_function(newX)), 20)
def attack_gdkde(scenario_name, plot=False):
    '''
    Invokes the GD-KDE attack for the given scenario and saves the
    resulting attack files in the location specified by the
    configuration file.

    plot controls plotting of the per-file score sequences: a false
    value disables plotting, the string 'show' shows the plot in a window
    and any other value is used as the file name to save the plot to.
    '''
    print('Running the GD-KDE attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_gdkde'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load and print malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from "{}"'.format(wolves))
    malicious = utility.get_pdfs(wolves)
    if not malicious:
        _attack_files_missing(wolves)

    # Load an SVM trained with scaled data. The context manager closes
    # the scaler file instead of leaking the handle.
    # NOTE(review): only unpickle scaler files from a trusted source.
    with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
        scaler = pickle.load(scaler_file)
    print('Using scaler')
    svm = sklearn_SVC()
    print('Loading model from "{}"'.format(scenario['model']))
    svm.load_model(scenario['model'])

    # Load the training data used for kernel density estimation
    print('Loading dataset from file "{}"'.format(scenario['training']))
    X_train, y_train, _ = datasets.csv2numpy(scenario['training'])
    # Subsample for faster execution. The sample size is capped at the
    # dataset size so that training sets with fewer than 500 rows no
    # longer make random.sample raise ValueError.
    ind_sample = random.sample(range(len(y_train)), min(500, len(y_train)))
    X_train = X_train[ind_sample, :]
    y_train = y_train[ind_sample]

    # Set parameters
    kde_reg = 10
    kde_width = 50
    step = 1
    max_iter = 50

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(svm, fname, scaler, X_train, y_train, kde_reg, kde_width, step,
              max_iter, False) for fname in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for res, oldf in zip(pool.imap(_gdkde_wrapper, pargs), malicious):
            if isinstance(res, Exception):
                print(res)
                continue
            (_, fseq, _, _, attack_file) = res
            print('Processing file "{}":'.format(oldf))
            print('  scores: {}'.format(', '.join([str(s) for s in fseq])))
            print('Result: "{}"'.format(attack_file))
            if path.dirname(attack_file) != output_dir:
                shutil.move(attack_file, output_dir)
            if plot:
                pyplot.plot(fseq, label=oldf)
    finally:
        # Shut the workers down; every result has been consumed by now
        # unless an error interrupted the loop.
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('GD-KDE attack')
        # gca() returns the axes that were plotted on; a bare pyplot.axes()
        # can create a new, empty axes on top of them in newer matplotlib.
        axes = pyplot.gca()
        axes.set_xlabel('Iterations')
        axes.set_xlim(0, max_iter + 1)
        axes.set_ylabel('SVM score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
def attack_mimicry(scenario_name, plot=False):
    '''
    Invokes the mimicry attack for the given scenario and saves the
    resulting attack files in the location specified by the
    configuration file. Each resulting file is stored under the base
    name of the attack sample it was derived from.

    plot controls plotting of the (original, mimic) score pairs: a false
    value disables plotting, the string 'show' shows the plot in a window
    and any other value is used as the file name to save the plot to.

    Raises ValueError if the scenario names an unknown classifier.
    '''
    print('Running the mimicry attack...')
    _initialize()
    scenario = _scenarios[scenario_name]
    output_dir = config.get('results', '{}_mimicry'.format(scenario_name))
    # Make results reproducible
    random.seed(0)
    # Load benign files
    print('Loading attack targets from file "{}"'.format(scenario['targets']))
    target_vectors, _, target_paths = datasets.csv2numpy(scenario['targets'])
    # list() keeps this a picklable list on Python 3 as well, where zip()
    # returns a one-shot iterator.
    targets = list(zip(target_paths, target_vectors))
    # Load malicious files
    wolves = config.get('experiments', 'contagio_attack_pdfs')
    if not path.exists(wolves):
        _attack_files_missing(wolves)
    print('Loading attack samples from file "{}"'.format(wolves))
    malicious = sorted(utility.get_pdfs(wolves))
    if not malicious:
        _attack_files_missing(wolves)

    # Set up classifier
    if scenario['classifier'] == 'rf':
        classifier = RandomForest()
        print('ATTACKING RANDOM FOREST')
    elif scenario['classifier'] == 'svm':
        classifier = sklearn_SVC()
        print('ATTACKING SVM')
    else:
        # Previously classifier stayed 0 and load_model failed with an
        # unrelated AttributeError.
        raise ValueError('Unknown classifier {!r} for scenario {}'.format(
            scenario['classifier'], scenario_name))
    print('Loading model from "{}"'.format(scenario['model']))
    classifier.load_model(scenario['model'])

    # Standardize data points if necessary
    scaler = None
    if 'scaled' in scenario['model']:
        # The context manager closes the file instead of leaking the handle.
        # NOTE(review): only unpickle scaler files from a trusted source.
        with open(config.get('datasets', 'contagio_scaler')) as scaler_file:
            scaler = pickle.load(scaler_file)
        print('Using scaler')

    # Set up multiprocessing
    pool = multiprocessing.Pool()
    pargs = [(mal, targets, classifier, scaler) for mal in malicious]

    if plot:
        pyplot.figure(1)
    print('Running the attack...')
    try:
        for wolf_path, res in zip(malicious, pool.imap(_mimicry_wrap, pargs)):
            if isinstance(res, Exception):
                print(res)
                continue
            (target_path, mimic_path, mimic_score, wolf_score) = res
            print('Modifying {p} [{s}]:'.format(p=wolf_path, s=wolf_score))
            print('  BEST: {p} [{s}]'.format(p=target_path, s=mimic_score))
            if path.dirname(mimic_path) != output_dir:
                # The result is stored under the attack sample's own name.
                destination = path.join(output_dir, path.basename(wolf_path))
                print('  Moving best to {}\n'.format(destination))
                shutil.move(mimic_path, destination)
            if plot:
                pyplot.plot([wolf_score, mimic_score])
    finally:
        # Shut the workers down; every result has been consumed by now
        # unless an error interrupted the loop.
        pool.terminate()
        pool.join()

    print('Saved resulting attack files to {}'.format(output_dir))

    if plot:
        pyplot.title('Mimicry attack')
        # gca() returns the axes that were plotted on; a bare pyplot.axes()
        # can create a new, empty axes on top of them in newer matplotlib.
        axes = pyplot.gca()
        axes.set_xlabel('Iterations')
        axes.set_ylabel('Classifier score')
        axes.yaxis.grid()
        fig = pyplot.gcf()
        fig.set_size_inches(6, 4.5)
        fig.subplots_adjust(bottom=0.1, top=0.92, left=0.1, right=0.96)
        if plot == 'show':
            pyplot.show()
        else:
            pyplot.savefig(plot, dpi=300)
            print('Saved plot to file {}'.format(plot))
Beispiel #25
0
 def test_fit(self):
     """Smoke test: fitting the SVC on the configured training CSV must not raise."""
     train_csv = config.get('sklearn_SVC_test', 'traindata')
     # csv2numpy yields a 3-tuple; the third element is not needed here.
     features, labels, _ = datasets.csv2numpy(train_csv)
     self.svc.fit(features, labels)