Code Example #1
def evaluate_traffic(gen_df, test_df, verbose=True):
    # Split generated and reference packets by direction (client vs. server)
    # before comparing the per-direction distributions with the KS test.
    restored_packets, from_idx = select_features(gen_df)
    client_gen_packets = restored_packets[from_idx]
    server_gen_packets = restored_packets[~from_idx]

    src_packets, src_from_id = select_features(test_df)
    client_src_packets = src_packets[src_from_id]
    server_src_packets = src_packets[~src_from_id]

    metrics = {
        'KS_2sample_PS_client': get_ks_2sample_stat(
            client_src_packets[:, 0], client_gen_packets[:, 0]),
        'KS_2sample_IAT_client': get_ks_2sample_stat(
            client_src_packets[:, 1], client_gen_packets[:, 1]),
        'KS_2sample_PS_server': get_ks_2sample_stat(
            server_src_packets[:, 0], server_gen_packets[:, 0]),
        'KS_2sample_IAT_server': get_ks_2sample_stat(
            server_src_packets[:, 1], server_gen_packets[:, 1]),
        'KS_2sample_thrpt_client': get_ks_2sample_stat(
            packets_to_throughput(client_src_packets),
            packets_to_throughput(client_gen_packets)),
        'KS_2sample_thrpt_server': get_ks_2sample_stat(
            packets_to_throughput(server_src_packets),
            packets_to_throughput(server_gen_packets)),
    }
    if verbose:
        pprint(metrics)
    return metrics
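
Neither get_ks_2sample_stat nor packets_to_throughput appears in these examples. Below is a minimal sketch of both, assuming the former wraps scipy.stats.ks_2samp and the latter rebuilds arrival timestamps from the IAT column and sums bytes per fixed window; both readings are assumptions, and the window length is illustrative.

# Hypothetical sketches of the helpers used above, not the project's code.
import numpy as np
from scipy.stats import ks_2samp


def get_ks_2sample_stat(sample_a, sample_b):
    # Two-sample Kolmogorov-Smirnov statistic: the sup-distance between the
    # two empirical CDFs (0 = identical distributions, 1 = fully disjoint).
    return ks_2samp(sample_a, sample_b).statistic


def packets_to_throughput(packets, window_ms=1000):
    # Assumes packets[:, 0] is packet size in bytes and packets[:, 1] is the
    # inter-arrival time in ms; reconstructs arrival timestamps and sums the
    # bytes falling into each fixed window to get a throughput series.
    timestamps = np.cumsum(packets[:, 1])
    bins = np.arange(0, timestamps[-1] + window_ms, window_ms)
    throughput, _ = np.histogram(timestamps, bins=bins, weights=packets[:, 0])
    return throughput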
Code Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset',
                        help='path to preprocessed .csv dataset',
                        required=True)
    args = parser.parse_args()
    ds_path = pathlib.Path(args.dataset)
    save_dir = BASE_DIR / 'obj' / ds_path.stem

    train_df, test_df = load_train_test_dataset(ds_path)

    quantizer, bic_dict = GaussianQuantizer().fit(*select_features(train_df),
                                                  min_comp=10,
                                                  max_comp=120,
                                                  step_comp=4,
                                                  return_bic_dict=True)
    quantizer.save_pretrained(save_dir)

    plot_bics(bic_dict['from'], 'From source')
    plot_bics(bic_dict['to'], 'To source')
    plt.tight_layout()
    plt.savefig(save_dir / 'BICs.png', dpi=300)
    plot_packets_dist(train_df)
    plt.savefig(save_dir / 'packets.png', dpi=300)

    scaled = PacketScaler().transform(select_features(train_df)[0])
    scaled = pd.DataFrame(scaled, columns=['PS, bytes / 1500', 'log(IAT, ms)'])
    scaled[ParsedFields.is_source] = (
        train_df[ParsedFields.is_source].reset_index(drop=True))

    plot_packets_dist(scaled, x='log(IAT, ms)', y='PS, bytes / 1500')
    plt.savefig(save_dir / 'scaled_packets.png', dpi=300)
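
The column labels above hint at what PacketScaler does: packet sizes are scaled to MTU units (bytes / 1500) and inter-arrival times are log-transformed. A minimal in-place sketch under that assumption; the epsilon guard is illustrative, not taken from the source.

import numpy as np


class PacketScaler:
    # Hypothetical sketch inferred from the plot labels, not the real class.
    MTU = 1500.0
    EPS = 1e-6  # illustrative guard against log(0) for back-to-back packets

    def transform(self, features):
        # Mutates in place; note the test below copies before calling this.
        features[:, 0] /= self.MTU
        features[:, 1] = np.log(features[:, 1] + self.EPS)
        return features

    def inverse_transform(self, features):
        features[:, 0] *= self.MTU
        features[:, 1] = np.exp(features[:, 1]) - self.EPS
        return features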
Code Example #3
def quantize_dataset(quantizer,
                     train_df,
                     test_df,
                     prepend_with_init_tokens=0):
    train_states = quantizer.transform(
        *select_features(train_df),
        prepend_with_init_tokens=prepend_with_init_tokens)
    test_states = quantizer.transform(*select_features(test_df))
    return train_states, test_states
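
A hypothetical call site follows; the warm-up length of 128 is purely illustrative, on the assumption that the prepended init tokens give a downstream sequence model a fixed warm-up context.

# Illustrative usage only; the value 128 is not taken from the source.
train_states, test_states = quantize_dataset(quantizer,
                                             train_df,
                                             test_df,
                                             prepend_with_init_tokens=128)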
Code Example #4
def test_parser(pcap_file):
    flow = 'UDP 10.1.3.143:5000 10.1.6.18:2006'
    parsed = extract_flow_stats(pcap_file, flow)
    packets, from_idx = select_features(parsed)

    assert from_idx.sum() == 236
    assert packets.shape[0] == 465
    assert int(packets[from_idx, 0].sum()) == 66080
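
extract_flow_stats is only exercised here, never shown. Below is a hypothetical scapy-based sketch of its apparent contract: keep the packets of the given 'PROTO src_ip:port dst_ip:port' flow (both directions) and emit per-packet size, inter-arrival time, and direction. The millisecond unit and the field handling are assumptions.

import pandas as pd
from scapy.all import IP, UDP, rdpcap


def extract_flow_stats(pcap_file, flow):
    # Hypothetical reconstruction, not the project's parser.
    _, src, dst = flow.split()
    src_ep, dst_ep = tuple(src.split(':')), tuple(dst.split(':'))
    rows, last_ts = [], None
    for pkt in rdpcap(pcap_file):
        if IP not in pkt or UDP not in pkt:
            continue
        ep_from = (pkt[IP].src, str(pkt[UDP].sport))
        ep_to = (pkt[IP].dst, str(pkt[UDP].dport))
        if {ep_from, ep_to} != {src_ep, dst_ep}:
            continue
        ts = float(pkt.time)
        iat_ms = 0.0 if last_ts is None else (ts - last_ts) * 1000
        last_ts = ts
        rows.append({ParsedFields.ps: len(pkt),
                     ParsedFields.iat: iat_ms,
                     ParsedFields.is_source: ep_from == src_ep})
    return pd.DataFrame(rows)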
Code Example #5
def test_sampling(raw_host_stats_path):
    train, test = load_train_test_dataset(raw_host_stats_path)
    model = HMMGenerator()

    model.fit(*select_features(train), min_comp=10, max_comp=12)
    gen = model.sample_packets_like(test)
    metrics = evaluate_traffic(gen, test)
    print(metrics)
    assert np.isclose(metrics['KS_2sample_thrpt_client'], 0.33, atol=1e-2)
    assert np.isclose(metrics['KS_2sample_thrpt_server'], 0.33, atol=1e-2)
Code Example #6
    def sample_packets_like(self, reference_stats) -> pd.DataFrame:
        _, directions = select_features(reference_stats)
        from_idx = directions.astype(bool)
        from_count, to_count = from_idx.sum(), (~from_idx).sum()
        packets_from, _ = self.model_from.sample(from_count,
                                                 random_state=RANDOM_SEED)
        packets_to, _ = self.model_to.sample(to_count,
                                             random_state=RANDOM_SEED)
        # Naive concatenation: packet order is preserved per direction,
        # but the two directions are not interleaved within the flow (!)
        packets_from = pd.DataFrame(
            self.scaler.inverse_transform(packets_from),
            columns=[ParsedFields.ps, ParsedFields.iat])
        packets_from[ParsedFields.is_source] = True

        packets_to = pd.DataFrame(self.scaler.inverse_transform(packets_to),
                                  columns=[ParsedFields.ps, ParsedFields.iat])
        packets_to[ParsedFields.is_source] = False
        return pd.concat([packets_from, packets_to],
                         axis=0).reset_index(drop=True)
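
select_features itself never appears in these examples, but its call sites pin down the contract: a two-column (PS, IAT) float matrix plus the boolean ParsedFields.is_source direction mask. A hypothetical reconstruction under that reading:

def select_features(df):
    # Hypothetical reconstruction from the call sites, not the project's code:
    # a (packet size, inter-arrival time) matrix and a boolean mask that is
    # True for packets emitted by the flow's source (client) host.
    packets = df[[ParsedFields.ps, ParsedFields.iat]].to_numpy(dtype=float)
    from_idx = df[ParsedFields.is_source].to_numpy(dtype=bool)
    return packets, from_idx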
Code Example #7
def test_soft_quantizer(raw_host_stats):
    source_features, directions = select_features(raw_host_stats)
    gaussian_quantizer = GaussianQuantizer().fit(source_features,
                                                 directions,
                                                 min_comp=5,
                                                 max_comp=20)

    q_tokens = gaussian_quantizer.transform(source_features,
                                            directions,
                                            prepend_with_init_tokens=10)
    dec_features, dec_directions = gaussian_quantizer.inverse_transform(
        q_tokens, prob_sampling=False)

    assert (directions == dec_directions).all()
    mean_exp_values = source_features.mean(axis=0)
    mape = mean_absolute_error(source_features / mean_exp_values,
                               dec_features / mean_exp_values)
    print(f'MAPE: {mape}')
    assert mape < 0.035
    assert get_ks_2sample_stat(source_features[:, 0], dec_features[:, 0]) < 0.54
    assert get_ks_2sample_stat(source_features[:, 1], dec_features[:, 1]) < 0.73

    prob_dec_features, prob_dec_directions = gaussian_quantizer.inverse_transform(
        q_tokens, prob_sampling=True)
    mape = mean_absolute_error(source_features / mean_exp_values,
                               prob_dec_features / mean_exp_values)
    print(f'MAPE with probabilistic sampling: {mape}')
    assert mape < 0.05
    assert get_ks_2sample_stat(source_features[:, 0], prob_dec_features[:, 0]) < 0.5
    assert get_ks_2sample_stat(source_features[:, 1], prob_dec_features[:, 1]) < 0.28
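
GaussianQuantizer's internals are not shown either; the test implies a GMM-style codebook in which transform maps each feature vector to its most likely mixture component (a discrete token) and inverse_transform returns that component's mean, or a draw from the component when prob_sampling=True. A single-direction sketch of the idea with sklearn's GaussianMixture follows; it is hypothetical, and the real class also handles directions and init tokens.

import numpy as np
from sklearn.mixture import GaussianMixture


class ToyGaussianQuantizer:
    # Hypothetical single-direction sketch of the GMM codebook idea.
    def fit(self, features, n_components=20):
        self.gmm = GaussianMixture(n_components=n_components,
                                   covariance_type='full',
                                   random_state=0).fit(features)
        return self

    def transform(self, features):
        # Token = index of the most likely mixture component.
        return self.gmm.predict(features)

    def inverse_transform(self, tokens, prob_sampling=False):
        if not prob_sampling:
            return self.gmm.means_[tokens]
        # Draw from each token's Gaussian instead of taking its mean.
        rng = np.random.default_rng(0)
        return np.stack([
            rng.multivariate_normal(self.gmm.means_[t],
                                    self.gmm.covariances_[t])
            for t in tokens
        ])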
Code Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset',
                        help='path to preprocessed .csv dataset',
                        required=True)
    parser.add_argument('--log_neptune',
                        dest='log_neptune',
                        action='store_true',
                        default=False)
    args = parser.parse_args()
    ds_path = pathlib.Path(args.dataset)
    save_dir = BASE_DIR / 'obj' / ('hmm_' + ds_path.stem)

    train_df, test_df = load_train_test_dataset(ds_path)

    generator, bic_dict = HMMGenerator().fit(*select_features(train_df),
                                             min_comp=1,
                                             max_comp=40,
                                             step_comp=2,
                                             return_bic_dict=True)
    generator.save_pretrained(save_dir)

    plot_bics(bic_dict['from'], 'From source')
    plot_bics(bic_dict['to'], 'To source')
    plt.tight_layout()
    plt.savefig(save_dir / 'BICs.png', dpi=300)

    gen_df = generator.sample_packets_like(test_df)
    eval_metrics = evaluate_traffic(gen_df, test_df)
    if args.log_neptune:
        neptune.init(
            NEPTUNE_PROJECT,
            NEPTUNE_API_TOKEN,
        )
        neptune.create_experiment(name='hmm_model', params=vars(args))
        for name, value in eval_metrics.items():
            neptune.log_metric(name, value)
        neptune.stop()
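
HMMGenerator.fit(..., return_bic_dict=True) evidently fits one model per candidate component count and records its BIC for each direction ('from'/'to'). The selection logic is not shown in the source; below is a sketch of such a scan, assuming hmmlearn's GaussianHMM, whose sample() signature matches the (observations, states) tuples unpacked in Code Example #6.

import numpy as np
from hmmlearn.hmm import GaussianHMM


def scan_hmm_bics(X, min_comp, max_comp, step_comp=1):
    # Hypothetical sketch of the model selection fit() appears to perform:
    # one GaussianHMM per candidate size, scored by BIC (lower is better).
    n_samples, n_dim = X.shape
    bics = {}
    for n in range(min_comp, max_comp + 1, step_comp):
        hmm = GaussianHMM(n_components=n,
                          covariance_type='full',
                          random_state=RANDOM_SEED).fit(X)
        log_likelihood = hmm.score(X)
        # Free parameters: start probs, transition rows, means, full covariances.
        n_params = ((n - 1) + n * (n - 1) + n * n_dim
                    + n * n_dim * (n_dim + 1) // 2)
        bics[n] = n_params * np.log(n_samples) - 2 * log_likelihood
    best_n = min(bics, key=bics.get)
    return best_n, bics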
Code Example #9
def test_scaler(raw_host_stats):
    scaler = PacketScaler()
    source_features = select_features(raw_host_stats)[0]
    scaled_features = scaler.transform(source_features.copy())
    reversed_features = scaler.inverse_transform(scaled_features)
    assert np.isclose(source_features, reversed_features, atol=1e-8).all()