Example #1
def test_benchmark_rank_by_median():
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    ranking = data_utils.benchmark_rank_by_median(snapshot_df)

    expected_ranking = pd.Series(index=['afl', 'libfuzzer'], data=[1100, 700])
    assert ranking.equals(expected_ranking)
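
A minimal sketch of what benchmark_rank_by_median could look like, consistent with the behavior this test exercises; the key parameter name and the exact FuzzBench implementation are assumptions:

def benchmark_rank_by_median(benchmark_snapshot_df, key='edges_covered'):
    """Rank fuzzers by the median of |key| at the snapshot time.

    Assumed sketch: group the snapshot rows by fuzzer, take the median,
    and sort in descending order so the best-performing fuzzer comes first.
    """
    medians = benchmark_snapshot_df.groupby('fuzzer')[key].median()
    return medians.sort_values(ascending=False)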
Example #2
def test_fuzzers_with_not_enough_samples():
    experiment_df = create_experiment_data()
    # Drop one of the afl/libxml trials (trial id 5).
    experiment_df = experiment_df[experiment_df.trial_id != 5]
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)

    expected_fuzzers = ['afl']
    assert data_utils.get_fuzzers_with_not_enough_samples(
        snapshot_df) == expected_fuzzers
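
One plausible implementation of get_fuzzers_with_not_enough_samples that this test would accept; the 0.8 fraction and the comparison against the best-sampled fuzzer are assumptions:

def get_fuzzers_with_not_enough_samples(benchmark_snapshot_df, threshold=0.8):
    """Return fuzzers whose trial count falls below |threshold| times the
    trial count of the best-sampled fuzzer in the snapshot.

    In the test data above, afl is left with 1 trial while libfuzzer has 2,
    so afl (1 < 0.8 * 2) is reported.
    """
    samples_per_fuzzer = benchmark_snapshot_df.fuzzer.value_counts()
    max_samples = samples_per_fuzzer.max()
    undersampled = samples_per_fuzzer[samples_per_fuzzer <
                                      threshold * max_samples]
    return undersampled.index.tolist()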
Example #3
def test_benchmark_rank_by_stat_test_wins():
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    ranking = data_utils.benchmark_rank_by_stat_test_wins(snapshot_df)

    expected_ranking = pd.Series(index=['libfuzzer', 'afl'], data=[0, 0])
    ranking.sort_index(inplace=True)
    expected_ranking.sort_index(inplace=True)
    assert ranking.equals(expected_ranking)
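
A hedged sketch of how benchmark_rank_by_stat_test_wins could count pairwise wins; the one-sided Mann-Whitney U test and the 0.05 significance level are assumptions. With only two trials per fuzzer, as in the test data, no comparison reaches significance, so both fuzzers score 0:

import itertools

import pandas as pd
import scipy.stats as ss


def benchmark_rank_by_stat_test_wins(benchmark_snapshot_df,
                                     key='edges_covered',
                                     p_threshold=0.05):
    """Rank fuzzers by how many opponents they beat with p < |p_threshold|."""
    samples = benchmark_snapshot_df.groupby('fuzzer')[key].apply(list)
    wins = pd.Series(0, index=samples.index)
    for first, second in itertools.permutations(samples.index, 2):
        # One-sided test: is |first| stochastically greater than |second|?
        _, p_value = ss.mannwhitneyu(samples[first],
                                     samples[second],
                                     alternative='greater')
        if p_value < p_threshold:
            wins[first] += 1
    return wins.sort_values(ascending=False)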
Example #4
    def coverage_growth_plot(self, benchmark_df, axes=None):
        """Draws coverage growth plot on given |axes|.

        The fuzzer labels will be in the order of their mean coverage at the
        snapshot time (typically, the end of the experiment).
        """
        benchmark_names = benchmark_df.benchmark.unique()
        assert len(benchmark_names) == 1, 'Not a single benchmark data!'

        benchmark_snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
        snapshot_time = benchmark_snapshot_df.time.unique()[0]
        fuzzer_order = data_utils.benchmark_rank_by_mean(
            benchmark_snapshot_df).index

        axes = sns.lineplot(
            y='edges_covered',
            x='time',
            hue='fuzzer',
            hue_order=fuzzer_order,
            data=benchmark_df[benchmark_df.time <= snapshot_time],
            ci=None if self._quick else 95,
            palette=self._fuzzer_colors,
            ax=axes)

        axes.set_title(_formatted_title(benchmark_snapshot_df))

        # Indicate the snapshot time with a big red vertical line.
        axes.axvline(x=snapshot_time, color='r')

        # Move legend outside of the plot.
        axes.legend(bbox_to_anchor=(1.00, 1),
                    borderaxespad=0,
                    loc='upper left',
                    frameon=False)

        axes.set(ylabel='Edge coverage')
        axes.set(xlabel='Time (hour:minute)')

        if self._logscale:
            axes.set_xscale('log')
            ticks = np.logspace(
                # Start from the time of the first measurement.
                np.log10(experiment_utils.DEFAULT_SNAPSHOT_SECONDS),
                np.log10(snapshot_time + 1),  # Include tick at end time.
                _DEFAULT_TICKS_COUNT)
        else:
            ticks = np.arange(
                experiment_utils.DEFAULT_SNAPSHOT_SECONDS,
                snapshot_time + 1,  # Include tick at end time.
                snapshot_time / _DEFAULT_TICKS_COUNT)

        axes.set_xticks(ticks)
        axes.set_xticklabels([_formatted_hour_min(t) for t in ticks])

        sns.despine(ax=axes, trim=True)
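
A minimal sketch of benchmark_rank_by_mean, which supplies the legend order used above; the key parameter (also used in Example #11) defaulting to 'edges_covered' is an assumption:

def benchmark_rank_by_mean(benchmark_snapshot_df, key='edges_covered'):
    """Rank fuzzers by the mean of |key| at the snapshot time.

    The plot only needs the index of the returned Series: the fuzzer names
    ordered from highest to lowest mean coverage.
    """
    means = benchmark_snapshot_df.groupby('fuzzer')[key].mean()
    return means.sort_values(ascending=False)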
Example #5
def test_benchmark_summary():
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    summary = data_utils.benchmark_summary(snapshot_df)

    expected_summary = pd.DataFrame({
        'fuzzer': ['afl', 'libfuzzer'],
        'time': [9, 9],
        'count': [2, 2],
        'min': [1000, 600],
        'median': [1100, 700],
        'max': [1200, 800]
    }).set_index(['fuzzer', 'time']).astype(float)
    assert summary[['count', 'min', 'median', 'max']].equals(expected_summary)
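
A plausible shape for benchmark_summary that would produce the columns checked above; building it on pandas describe() and renaming the 50% quantile to median is an assumption:

def benchmark_summary(benchmark_snapshot_df, key='edges_covered'):
    """Per-fuzzer summary statistics of |key| at the snapshot time.

    describe() yields count/mean/std/min/quartiles/max per (fuzzer, time)
    group; the test above only inspects count, min, median and max.
    """
    groups = benchmark_snapshot_df.groupby(['fuzzer', 'time'])
    summary = groups[key].describe()
    return summary.rename(columns={'50%': 'median'})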
Example #6
def test_benchmark_snapshot():
    """Tests that the snapshot data contains only the latest timestamp for all
    trials, in case all trials have the same lengths."""
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
    timestamps_per_trial = snapshot_df[['trial_id', 'time']]
    timestamps_per_trial.reset_index(drop=True, inplace=True)

    # The latest timestamp is 9 in the example data.
    expected_timestamps_per_trial = pd.DataFrame([{
        'trial_id': trial,
        'time': 9
    } for trial in range(4, 8)])
    assert timestamps_per_trial.equals(expected_timestamps_per_trial)
Example #7
    def crash_plot(self, benchmark_df, axes=None, logscale=False):
        """Draws crash plot."""
        benchmark_names = benchmark_df.benchmark.unique()
        assert len(benchmark_names) == 1, 'Not a single benchmark data!'

        benchmark_snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
        snapshot_time = benchmark_snapshot_df.time.unique()[0]
        crash_df = data_utils.get_crash_snaphot(benchmark_df)

        axes = sns.lineplot(y='crashes',
                            x='time',
                            hue='fuzzer',
                            data=crash_df[crash_df.time <= snapshot_time],
                            ci=None if self._quick else 95,
                            palette=self._fuzzer_colors,
                            ax=axes)

        axes.set_title(_formatted_title(benchmark_snapshot_df))

        # Indicate the snapshot time with a big red vertical line.
        axes.axvline(x=snapshot_time, color='r')

        # Move legend outside of the plot.
        axes.legend(bbox_to_anchor=(1.00, 1),
                    borderaxespad=0,
                    loc='upper left',
                    frameon=False)

        axes.set(ylabel='Unique crashes')
        axes.set(xlabel='Time (hour:minute)')

        if self._logscale or logscale:
            axes.set_xscale('log')
            ticks = np.logspace(
                # Start from the time of the first measurement.
                np.log10(experiment_utils.DEFAULT_SNAPSHOT_SECONDS),
                np.log10(snapshot_time + 1),  # Include tick at end time.
                _DEFAULT_TICKS_COUNT)
        else:
            ticks = np.arange(
                experiment_utils.DEFAULT_SNAPSHOT_SECONDS,
                snapshot_time + 1,  # Include tick at end time.
                snapshot_time / _DEFAULT_TICKS_COUNT)

        axes.set_xticks(ticks)
        axes.set_xticklabels([_formatted_hour_min(t) for t in ticks])

        sns.despine(ax=axes, trim=True)
Example #8
def test_benchmark_snapshot_complete(threshold):
    """Tests that the snapshot data contains only the latest timestamp for all
    trials, in case all trials have the same lengths. This should happen
    independently of the used |threshold|.
    """
    experiment_df = create_experiment_data()
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df, threshold)
    timestamps_per_trial = snapshot_df[['trial_id', 'time']]
    timestamps_per_trial.reset_index(drop=True, inplace=True)

    # The latest timestamp is 9 in the example data.
    libxml_trial_ids = range(4, 8)
    expected_timestamps_per_trial = pd.DataFrame([{
        'trial_id': trial,
        'time': 9
    } for trial in libxml_trial_ids])
    assert timestamps_per_trial.equals(expected_timestamps_per_trial)
Example #9
def test_benchmark_snapshot_incomplete(threshold, expected_snapshot_time,
                                       expected_trials_left):
    """Tests that the snapshot data created from an incomplete benchmark
    data (with some early terminating trials) contains the right trial
    snapshots with the right timestamp according to the given
    |threshold|. The function under test snapshots the benchmark data
    at the latest time where |threshold| fraction of the trials are
    still running. This means that with lower |threshold| the snapshot
    will be made later in time, but also more trials will be thrown
    out.
    """
    experiment_df = create_experiment_data(incomplete=True)
    benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
    snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df, threshold)
    timestamps_per_trial = snapshot_df[['trial_id', 'time']]
    timestamps_per_trial.reset_index(drop=True, inplace=True)

    trials_left = len(timestamps_per_trial.index)
    assert trials_left == expected_trials_left
    # All trial snapshots should have the same expected timestamp.
    assert (timestamps_per_trial['time'] == expected_snapshot_time).all()
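
For reference, a sketch of get_benchmark_snapshot matching the behavior these tests describe: pick the latest time at which at least a |threshold| fraction of the benchmark's trials still report data, and keep only the rows at that time. The 0.8 default is an assumption:

_DEFAULT_SNAPSHOT_THRESHOLD = 0.8  # Assumed default |threshold| fraction.


def get_benchmark_snapshot(benchmark_df,
                           threshold=_DEFAULT_SNAPSHOT_THRESHOLD):
    """Cut a snapshot of |benchmark_df| at the latest time where at least
    |threshold| fraction of the trials were still running."""
    num_trials = benchmark_df.trial_id.nunique()
    # How many trials reported a measurement at each point in time.
    trials_running_at_time = benchmark_df.time.value_counts()
    criteria = trials_running_at_time >= threshold * num_trials
    snapshot_time = trials_running_at_time[criteria].index.max()
    return benchmark_df[benchmark_df.time == snapshot_time]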
Example #10
    def _benchmark_snapshot_df(self):
        return data_utils.get_benchmark_snapshot(self._benchmark_df)
Example #11
    def coverage_growth_plot(self,
                             benchmark_df,
                             axes=None,
                             logscale=False,
                             bugs=False):
        """Draws edge (or bug) coverage growth plot on given |axes|.

        The fuzzer labels will be in the order of their mean coverage at the
        snapshot time (typically, the end of the experiment).
        """
        self._common_datafame_checks(benchmark_df)

        column_of_interest = 'bugs_covered' if bugs else 'edges_covered'

        benchmark_snapshot_df = data_utils.get_benchmark_snapshot(benchmark_df)
        snapshot_time = benchmark_snapshot_df.time.unique()[0]
        fuzzer_order = data_utils.benchmark_rank_by_mean(
            benchmark_snapshot_df, key=column_of_interest).index

        axes = sns.lineplot(
            y=column_of_interest,
            x='time',
            hue='fuzzer',
            hue_order=fuzzer_order,
            data=benchmark_df[benchmark_df.time <= snapshot_time],
            ci=None if bugs or self._quick else 95,
            estimator=np.median,
            palette=self._fuzzer_colors,
            style='fuzzer',
            dashes=False,
            markers=self._fuzzer_markers,
            ax=axes)

        axes.set_title(_formatted_title(benchmark_snapshot_df))

        # Indicate the snapshot time with a big red vertical line.
        axes.axvline(x=snapshot_time, color='r')

        # Move legend outside of the plot.
        axes.legend(bbox_to_anchor=(1.00, 1),
                    borderaxespad=0,
                    loc='upper left',
                    frameon=False)

        axes.set(ylabel='Bug coverage' if bugs else 'Code region coverage')
        axes.set(xlabel='Time (hour:minute)')

        if self._logscale or logscale:
            axes.set_xscale('log')
            ticks = np.logspace(
                # Start from the time of the first measurement.
                np.log10(experiment_utils.DEFAULT_SNAPSHOT_SECONDS),
                np.log10(snapshot_time + 1),  # Include tick at end time.
                _DEFAULT_TICKS_COUNT)
        else:
            ticks = np.arange(
                experiment_utils.DEFAULT_SNAPSHOT_SECONDS,
                snapshot_time + 1,  # Include tick at end time.
                snapshot_time / _DEFAULT_TICKS_COUNT)

        axes.set_xticks(ticks)
        axes.set_xticklabels([_formatted_hour_min(t) for t in ticks])

        sns.despine(ax=axes, trim=True)
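
A hypothetical usage sketch for the plot method above; the plotter instance, the experiment_df source, and the output file name are all assumptions for illustration:

import matplotlib.pyplot as plt

# Hypothetical names: |plotter| is an instance of the plotting class above,
# |experiment_df| is the experiment data frame used throughout the tests.
benchmark_df = experiment_df[experiment_df.benchmark == 'libxml']
fig, axes = plt.subplots(figsize=(8, 5))
plotter.coverage_growth_plot(benchmark_df, axes=axes, logscale=True)
fig.savefig('libxml_coverage_growth.svg', bbox_inches='tight')
plt.close(fig)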