Example no. 1
0
def perform_experiment(train_fs,
                       test_fs,
                       avstats_in,
                       binarize,
                       classifier='RF',
                       subsample=False):
    """Run a rolling train/test experiment over paired svmlight files.

    Parameters
    ----------
    train_fs, test_fs : sequence of str
        Paths to svmlight-format training/test files, paired by period.
    avstats_in : mapping
        SHA256 -> {'report': {av_name: detected, ...}} AV scan metadata.
    binarize : bool
        If True, replace all non-zero feature values with 1.
    classifier : str
        'RF' (random forest) or 'SVM' (RBF-kernel SVC).
    subsample : float or False
        If truthy, fraction of training rows to keep. NOTE(review):
        sampling is done *with* replacement; pass replace=False to
        numpy.random.choice if true subsampling is intended — confirm.

    Returns
    -------
    tuple
        (res, key_dates, avstats): concatenated per-period stats,
        period start dates, and per-AV detection counters.

    Raises
    ------
    ValueError
        If `classifier` is neither 'RF' nor 'SVM'.
    """
    print('Performing experiment')
    res = []
    key_dates = []
    avstats = collections.defaultdict(int)
    for w, (f_tr, f_te) in enumerate(zip(train_fs, test_fs), start=1):
        # Load test dates; the period is defined by the test file's range.
        dates = numpy.array(load_dates(f_te))
        week_s, week_e = dates.min(), dates.max()
        key_dates.append(week_s)
        print('\nPeriod {} [{} - {}]'.format(w, week_s, week_e))

        # Load training data (suppress svmlight parser warnings).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_tr, y_tr = datasets.load_svmlight_file(f_tr)
        print(X_tr.shape)
        if subsample:
            # Keep a `subsample` fraction of training rows (with replacement).
            new_size = int(round(X_tr.shape[0] * subsample))
            subsam = numpy.random.choice(X_tr.shape[0], new_size)
            X_tr = X_tr[subsam, :]
            y_tr = y_tr[subsam]
        if binarize:
            X_tr.data = numpy.ones_like(X_tr.data)
        X_tr = X_tr.toarray()

        # Train classifier; fail fast on an unsupported name instead of
        # hitting a NameError at clf.fit below (BUGFIX).
        if classifier == 'RF':
            clf = RFC(n_estimators=200, n_jobs=1 if subsample else -1)
        elif classifier == 'SVM':
            clf = SVC(kernel='rbf', gamma=0.0025, C=12)
        else:
            raise ValueError('Unknown classifier: {!r}'.format(classifier))
        sample_weight = None
        print('Training set size: {}'.format(X_tr.shape))
        clf.fit(X_tr, y_tr, sample_weight=sample_weight)
        tr_n_feats = X_tr.shape[1]
        del X_tr  # free the dense training matrix before loading test data

        # Load and classify test data, padded/truncated to training width.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_te, y_te = datasets.load_svmlight_file(f_te,
                                                     n_features=tr_n_feats)
        if binarize:
            X_te.data = numpy.ones_like(X_te.data)
        X_te = X_te.toarray()
        print('Test set size: {}'.format(X_te.shape))
        y_pr = clf.predict(X_te)
        # Continuous score: positive-class probability for RF, signed
        # margin for SVM.
        if classifier == 'RF':
            y_val = clf.predict_proba(X_te)[:, 1]
        else:  # 'SVM' — validated above
            y_val = clf.decision_function(X_te)
        del X_te

        # Evaluate experimental results for this period.
        res.append(experiment_stats(y_tr, y_te, y_pr, y_val))

        # SHA256 IDs of the malicious (label > 0.5) test samples.
        fileIDs = numpy.array(load_SHA256_sums(f_te))[numpy.where(y_te > 0.5)]

        # Update AV detection results.
        # BUGFIX: dict.iteritems() is Python 2 only; use items() on Python 3.
        for fid in fileIDs:
            avstats['Total'] += 1
            if fid in avstats_in:
                for av, det in avstats_in[fid]['report'].items():
                    if det:
                        avstats[av] += 1
        del fileIDs
        # True positives this model scored in the current period.
        avstats['Hidost'] += numpy.logical_and(y_te == y_pr, y_te > 0.5).sum()
    res = numpy.concatenate(res)
    return res, key_dates, avstats
Example no. 2
0
def perform_experiment(train_fs, test_fs, avstats_in, binarize,
                       classifier='RF', subsample=False):
    """Run a rolling train/test experiment over paired svmlight files.

    Parameters
    ----------
    train_fs, test_fs : sequence of str
        Paths to svmlight-format training/test files, paired by period.
    avstats_in : mapping
        SHA256 -> {'report': {av_name: detected, ...}} AV scan metadata.
    binarize : bool
        If True, replace all non-zero feature values with 1.
    classifier : str
        'RF' (random forest) or 'SVM' (RBF-kernel SVC).
    subsample : float or False
        If truthy, fraction of training rows to keep. NOTE(review):
        sampling is done *with* replacement; pass replace=False to
        numpy.random.choice if true subsampling is intended — confirm.

    Returns
    -------
    tuple
        (res, key_dates, avstats): concatenated per-period stats,
        period start dates, and per-AV detection counters.

    Raises
    ------
    ValueError
        If `classifier` is neither 'RF' nor 'SVM'.
    """
    print('Performing experiment')
    res = []
    key_dates = []
    avstats = collections.defaultdict(int)
    for w, (f_tr, f_te) in enumerate(zip(train_fs, test_fs), start=1):
        # Load test dates; the period is defined by the test file's range.
        dates = numpy.array(load_dates(f_te))
        week_s, week_e = dates.min(), dates.max()
        key_dates.append(week_s)
        print('\nPeriod {} [{} - {}]'.format(w, week_s, week_e))

        # Load training data (suppress svmlight parser warnings).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_tr, y_tr = datasets.load_svmlight_file(f_tr)
        print(X_tr.shape)
        if subsample:
            # Keep a `subsample` fraction of training rows (with replacement).
            new_size = int(round(X_tr.shape[0] * subsample))
            subsam = numpy.random.choice(X_tr.shape[0], new_size)
            X_tr = X_tr[subsam, :]
            y_tr = y_tr[subsam]
        if binarize:
            X_tr.data = numpy.ones_like(X_tr.data)
        X_tr = X_tr.toarray()

        # Train classifier; fail fast on an unsupported name instead of
        # hitting a NameError at clf.fit below (BUGFIX).
        if classifier == 'RF':
            clf = RFC(n_estimators=200, n_jobs=1 if subsample else -1)
        elif classifier == 'SVM':
            clf = SVC(kernel='rbf', gamma=0.0025, C=12)
        else:
            raise ValueError('Unknown classifier: {!r}'.format(classifier))
        sample_weight = None
        print('Training set size: {}'.format(X_tr.shape))
        clf.fit(X_tr, y_tr, sample_weight=sample_weight)
        tr_n_feats = X_tr.shape[1]
        del X_tr  # free the dense training matrix before loading test data

        # Load and classify test data, padded/truncated to training width.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_te, y_te = datasets.load_svmlight_file(f_te, n_features=tr_n_feats)
        if binarize:
            X_te.data = numpy.ones_like(X_te.data)
        X_te = X_te.toarray()
        print('Test set size: {}'.format(X_te.shape))
        y_pr = clf.predict(X_te)
        # Continuous score: positive-class probability for RF, signed
        # margin for SVM.
        if classifier == 'RF':
            y_val = clf.predict_proba(X_te)[:, 1]
        else:  # 'SVM' — validated above
            y_val = clf.decision_function(X_te)
        del X_te

        # Evaluate experimental results for this period.
        res.append(experiment_stats(y_tr, y_te, y_pr, y_val))

        # SHA256 IDs of the malicious (label > 0.5) test samples.
        fileIDs = numpy.array(
            load_SHA256_sums(f_te))[numpy.where(y_te > 0.5)]

        # Update AV detection results.
        # BUGFIX: dict.iteritems() is Python 2 only; use items() on Python 3.
        for fid in fileIDs:
            avstats['Total'] += 1
            if fid in avstats_in:
                for av, det in avstats_in[fid]['report'].items():
                    if det:
                        avstats[av] += 1
        del fileIDs
        # True positives this model scored in the current period.
        avstats['Hidost'] += numpy.logical_and(y_te == y_pr, y_te > 0.5).sum()
    res = numpy.concatenate(res)
    return res, key_dates, avstats
Example no. 3
0
def main():
    """Count malicious/benign samples per period and plot the totals.

    Parses command-line arguments, tallies label counts from the paired
    training/test svmlight files, prints a per-period summary, and saves
    a grouped stacked-bar chart to every requested output path.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('--train',
                        nargs='+',
                        required=True,
                        help='Training data file(s).')
    parser.add_argument('--test',
                        nargs='+',
                        required=True,
                        help='Test data file(s).')
    parser.add_argument('-l', '--log',
                        action='store_true',
                        help='X-axis log scale.')
    parser.add_argument('--legend',
                        default=False,
                        help='Where to put legend.')
    parser.add_argument('--data-plot',
                        required=True,
                        nargs='*',
                        help='Where to save data quantity plot.')

    args = parser.parse_args()

    print('\nEvaluating data in time periods')
    counts = []
    period_starts = []
    years_seen = set()
    for period, (train_path, test_path) in enumerate(
            zip(args.train, args.test), start=1):
        # Tally malicious (>0.5) and benign (<0.5) labels in the test file.
        labels_te = load_libsvm_labels(test_path)
        mal_te = (labels_te > 0.5).sum()
        ben_te = (labels_te < 0.5).sum()

        # The test file's date range defines the period boundaries.
        period_dates = numpy.array(load_dates(test_path))
        start, end = period_dates.min(), period_dates.max()
        period_starts.append(start)
        print('Period {} [{} - {}]'.format(period, start, end))
        years_seen.add(str(start.year))

        # Tally the same label split for the training file.
        labels_tr = load_libsvm_labels(train_path)
        mal_tr = (labels_tr > 0.5).sum()
        ben_tr = (labels_tr < 0.5).sum()

        print('Training: {} malicious, {} benign'.format(mal_tr, ben_tr))
        print('Test: {} malicious, {} benign'.format(mal_te, ben_te),
              end='\n\n')
        counts.append((mal_tr, ben_tr, mal_te, ben_te))

    pos_tr, neg_tr, pos_te, neg_te = zip(*counts)
    print('Dates ranging from {} to {}'.format(period_starts[0],
                                               period_starts[-1]))
    print('Total days: {}'.format(
        (period_starts[-1] - period_starts[0]).days + 1))

    print('Plotting training and test sizes')
    bar_w = 0.35
    gap = 0.05  # spacing between a pair of training/test bars
    positions = numpy.arange(len(pos_tr)).astype(numpy.float32)

    # Draw stacked bar pairs: training to the left of each tick,
    # evaluation to the right; benign on the bottom, malicious on top.
    plots.init_eurasip_style(figure_width=222.5, figure_height=170.0)
    fig = pylab.figure()
    ax = pylab.gca()
    ax.bar(positions - bar_w - gap, neg_tr, width=bar_w,
           color='#00691f', linewidth=0, label='Benign training')
    ax.bar(positions - bar_w - gap, pos_tr, bottom=neg_tr,
           width=bar_w, color='#a50007', linewidth=0,
           label='Malicious training')
    ax.bar(positions + gap, neg_te, width=bar_w, color='#67bc6b',
           linewidth=0, label='Benign evaluation')
    ax.bar(positions + gap, pos_te, bottom=neg_te, width=bar_w,
           color='#ff767d', linewidth=0, label='Malicious evaluation')

    # X axis: one tick per period, labelled with the period start date.
    ax.set_xticks(positions)
    ax.set_xticklabels([d.strftime('%b %d') for d in period_starts])
    ax.set_xlim((-2.0 * gap - bar_w,
                 len(pos_tr) - 1 + 2.0 * gap + bar_w))
    year_span = sorted(years_seen)
    if len(year_span) > 2:
        # Collapse long year runs to "first - last".
        year_span = [year_span[0], year_span[-1]]
    ax.set_xlabel('Date ({})'.format(' - '.join(year_span)))
    fig.autofmt_xdate()

    # Y axis: scientific notation, horizontal grid drawn behind the bars.
    pylab.ticklabel_format(axis='y', style='sci', scilimits=(0, 2),
                           useOffset=False)
    ax.yaxis.grid()
    ax.set_axisbelow(True)
    if args.log:
        ax.set_yscale('log')
    ax.set_ylabel('Samples')

    # Legend placement; the literal string 'none' suppresses it.
    legend_loc = args.legend if args.legend else 'best'
    if legend_loc != 'none':
        pylab.legend(loc=legend_loc, fancybox=True, framealpha=0.5)

    # Finalize layout and write the figure to every requested file.
    pylab.tight_layout(pad=0.5, h_pad=0.5, w_pad=0.5, rect=(0, 0, 1, 1))
    for plot_file in args.data_plot:
        pylab.savefig(plot_file)

    return 0