def evaluate_traffic(gen_df, test_df, verbose=True):
    gen_packets, gen_from_idx = select_features(gen_df)
    client_gen_packets = gen_packets[gen_from_idx]
    server_gen_packets = gen_packets[~gen_from_idx]

    src_packets, src_from_idx = select_features(test_df)
    client_src_packets = src_packets[src_from_idx]
    server_src_packets = src_packets[~src_from_idx]

    metrics = {
        'KS_2sample_PS_client': get_ks_2sample_stat(
            client_src_packets[:, 0], client_gen_packets[:, 0]),
        'KS_2sample_IAT_client': get_ks_2sample_stat(
            client_src_packets[:, 1], client_gen_packets[:, 1]),
        'KS_2sample_PS_server': get_ks_2sample_stat(
            server_src_packets[:, 0], server_gen_packets[:, 0]),
        'KS_2sample_IAT_server': get_ks_2sample_stat(
            server_src_packets[:, 1], server_gen_packets[:, 1]),
        'KS_2sample_thrpt_client': get_ks_2sample_stat(
            packets_to_throughput(client_src_packets),
            packets_to_throughput(client_gen_packets)),
        'KS_2sample_thrpt_server': get_ks_2sample_stat(
            packets_to_throughput(server_src_packets),
            packets_to_throughput(server_gen_packets)),
    }
    if verbose:
        pprint(metrics)
    return metrics
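
# The two helpers used above (get_ks_2sample_stat, packets_to_throughput)
# are defined elsewhere in the repo. A minimal sketch of what they plausibly
# do, for reference only -- the window size and column layout are assumptions:
import numpy as np
from scipy.stats import ks_2samp


def get_ks_2sample_stat(sample_a, sample_b):
    # Two-sample Kolmogorov-Smirnov statistic: the max distance between the
    # empirical CDFs of the two samples (0 = identical, 1 = disjoint).
    return ks_2samp(sample_a, sample_b).statistic


def packets_to_throughput(packets, window_ms=1000.0):
    # ASSUMED layout: packets[:, 0] = payload size (bytes),
    # packets[:, 1] = inter-arrival time (ms). Bucket packets into fixed
    # time windows via cumulative arrival time and sum the bytes per window.
    arrival_ms = np.cumsum(packets[:, 1])
    windows = (arrival_ms // window_ms).astype(int)
    return np.bincount(windows, weights=packets[:, 0])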
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', help='path to preprocessed .csv dataset',
                        required=True)
    args = parser.parse_args()
    ds_path = pathlib.Path(args.dataset)
    save_dir = BASE_DIR / 'obj' / ds_path.stem

    train_df, test_df = load_train_test_dataset(ds_path)
    quantizer, bic_dict = GaussianQuantizer().fit(
        *select_features(train_df),
        min_comp=10, max_comp=120, step_comp=4,
        return_bic_dict=True)
    quantizer.save_pretrained(save_dir)

    plot_bics(bic_dict['from'], 'From source')
    plot_bics(bic_dict['to'], 'To source')
    plt.tight_layout()
    plt.savefig(save_dir / 'BICs.png', dpi=300)

    plot_packets_dist(train_df)
    plt.savefig(save_dir / 'packets.png', dpi=300)

    scaled = PacketScaler().transform(select_features(train_df)[0])
    scaled = pd.DataFrame(scaled, columns=['PS, bytes / 1500', 'log(IAT, ms)'])
    scaled[ParsedFields.is_source] = train_df[
        ParsedFields.is_source].reset_index(drop=True)
    plot_packets_dist(scaled, x='log(IAT, ms)', y='PS, bytes / 1500')
    plt.savefig(save_dir / 'scaled_packets.png', dpi=300)
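
# plot_bics is a repo helper; a plausible minimal implementation, assuming
# bic_dict maps component count -> BIC score (a sketch, not the repo's
# actual code):
import matplotlib.pyplot as plt


def plot_bics(bics: dict, label: str):
    comp_counts = sorted(bics)
    plt.plot(comp_counts, [bics[c] for c in comp_counts],
             marker='o', label=label)
    plt.xlabel('Number of mixture components')
    plt.ylabel('BIC')
    plt.legend()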
def quantize_dataset(quantizer, train_df, test_df, prepend_with_init_tokens=0):
    train_states = quantizer.transform(
        *select_features(train_df),
        prepend_with_init_tokens=prepend_with_init_tokens)
    test_states = quantizer.transform(*select_features(test_df))
    return train_states, test_states
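
# Example usage with a quantizer fitted as in main(); the init-token count
# here is illustrative:
#
#   train_states, test_states = quantize_dataset(
#       quantizer, train_df, test_df, prepend_with_init_tokens=10)
#
# The returned states are token sequences (one token per packet, plus the
# prepended init tokens), suitable as input for a sequence model.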
def test_parser(pcap_file):
    flow = 'UDP 10.1.3.143:5000 10.1.6.18:2006'
    parsed = extract_flow_stats(pcap_file, flow)
    packets, from_idx = select_features(parsed)
    # 236 of the 465 parsed packets travel from the source side.
    assert from_idx.sum() == 236
    assert packets.shape[0] == 465
    # Total payload bytes sent from the source.
    assert int(packets[from_idx, 0].sum()) == 66080
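
# The pcap_file argument is a pytest fixture, presumably provided via
# conftest.py; a hypothetical equivalent (the file name is a placeholder):
import pathlib

import pytest


@pytest.fixture
def pcap_file():
    return pathlib.Path(__file__).parent / 'data' / 'example.pcap'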
def test_sampling(raw_host_stats_path):
    train, test = load_train_test_dataset(raw_host_stats_path)
    model = HMMGenerator()
    model.fit(*select_features(train), min_comp=10, max_comp=12)
    gen = model.sample_packets_like(test)
    metrics = evaluate_traffic(gen, test)
    print(metrics)
    assert np.isclose(metrics['KS_2sample_thrpt_client'], 0.33, atol=1e-2)
    assert np.isclose(metrics['KS_2sample_thrpt_server'], 0.33, atol=1e-2)
def sample_packets_like(self, reference_stats) -> pd.DataFrame:
    _, directions = select_features(reference_stats)
    from_idx = directions.astype(bool)
    from_count = int(from_idx.sum())
    to_count = int((~from_idx).sum())

    packets_from, _ = self.model_from.sample(from_count,
                                             random_state=RANDOM_SEED)
    packets_to, _ = self.model_to.sample(to_count, random_state=RANDOM_SEED)

    # Naive concatenation: packet order is preserved within each direction,
    # but not across the flow as a whole (!)
    packets_from = pd.DataFrame(
        self.scaler.inverse_transform(packets_from),
        columns=[ParsedFields.ps, ParsedFields.iat])
    packets_from[ParsedFields.is_source] = True
    packets_to = pd.DataFrame(
        self.scaler.inverse_transform(packets_to),
        columns=[ParsedFields.ps, ParsedFields.iat])
    packets_to[ParsedFields.is_source] = False
    return pd.concat([packets_from, packets_to],
                     axis=0).reset_index(drop=True)
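
# One way to repair the per-flow ordering noted above: interleave the two
# directions by cumulative arrival time and recompute flow-wide IATs. This
# is a sketch (not part of HMMGenerator) that assumes both directions start
# at t=0; literal column names stand in for the ParsedFields constants.
import pandas as pd


def interleave_by_arrival(packets_df: pd.DataFrame) -> pd.DataFrame:
    df = packets_df.copy()
    # Per-direction cumulative IAT yields each packet's arrival timestamp.
    df['arrival'] = df.groupby('is_source')['iat'].cumsum()
    df = df.sort_values('arrival', kind='stable').reset_index(drop=True)
    # Recompute IATs on the merged, flow-wide timeline.
    df['iat'] = df['arrival'].diff().fillna(df['arrival'].iloc[0])
    return df.drop(columns='arrival')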
def test_soft_quantizer(raw_host_stats):
    source_features, directions = select_features(raw_host_stats)
    gaussian_quantizer = GaussianQuantizer().fit(source_features, directions,
                                                 min_comp=5, max_comp=20)
    q_tokens = gaussian_quantizer.transform(source_features, directions,
                                            prepend_with_init_tokens=10)

    dec_features, dec_directions = gaussian_quantizer.inverse_transform(
        q_tokens, prob_sampling=False)
    assert (directions == dec_directions).all()
    # MAE on mean-normalized features, i.e. a scale-free reconstruction error.
    mean_exp_values = source_features.mean(axis=0)
    mape = mean_absolute_error(source_features / mean_exp_values,
                               dec_features / mean_exp_values)
    print(f'MAPE: {mape}')
    assert mape < 0.035
    assert get_ks_2sample_stat(source_features[:, 0], dec_features[:, 0]) < 0.54
    assert get_ks_2sample_stat(source_features[:, 1], dec_features[:, 1]) < 0.73

    prob_dec_features, prob_dec_directions = gaussian_quantizer.inverse_transform(
        q_tokens, prob_sampling=True)
    mape = mean_absolute_error(source_features / mean_exp_values,
                               prob_dec_features / mean_exp_values)
    print(f'MAPE with probabilistic sampling: {mape}')
    assert mape < 0.05
    assert get_ks_2sample_stat(source_features[:, 0],
                               prob_dec_features[:, 0]) < 0.5
    assert get_ks_2sample_stat(source_features[:, 1],
                               prob_dec_features[:, 1]) < 0.28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', help='path to preprocessed .csv dataset',
                        required=True)
    parser.add_argument('--log_neptune', dest='log_neptune',
                        action='store_true', default=False)
    args = parser.parse_args()
    ds_path = pathlib.Path(args.dataset)
    save_dir = BASE_DIR / 'obj' / ('hmm_' + ds_path.stem)

    train_df, test_df = load_train_test_dataset(ds_path)
    generator, bic_dict = HMMGenerator().fit(
        *select_features(train_df),
        min_comp=1, max_comp=40, step_comp=2,
        return_bic_dict=True)
    generator.save_pretrained(save_dir)

    plot_bics(bic_dict['from'], 'From source')
    plot_bics(bic_dict['to'], 'To source')
    plt.tight_layout()
    plt.savefig(save_dir / 'BICs.png', dpi=300)

    gen_df = generator.sample_packets_like(test_df)
    eval_metrics = evaluate_traffic(gen_df, test_df)

    if args.log_neptune:
        neptune.init(NEPTUNE_PROJECT, NEPTUNE_API_TOKEN)
        neptune.create_experiment(name='hmm_model', params=vars(args))
        for name, value in eval_metrics.items():
            neptune.log_metric(name, value)
        neptune.stop()
def test_scaler(raw_host_stats):
    scaler = PacketScaler()
    source_features = select_features(raw_host_stats)[0]
    # transform() mutates its input, so pass a copy to keep the original.
    scaled_features = scaler.transform(source_features.copy())
    reversed_features = scaler.inverse_transform(scaled_features)
    assert np.isclose(source_features, reversed_features, atol=1e-8).all()
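
# A minimal PacketScaler sketch consistent with the axis labels used in
# main() ('PS, bytes / 1500', 'log(IAT, ms)'). The repo's class may differ,
# e.g. in how it guards log(0); the epsilon here is an assumption.
import numpy as np

MTU = 1500.0
LOG_EPS = 1e-9  # hypothetical guard for zero inter-arrival times


class PacketScaler:
    def transform(self, features):
        # In-place: features[:, 0] = PS (bytes), features[:, 1] = IAT (ms),
        # which is why test_scaler passes a copy.
        features[:, 0] /= MTU
        features[:, 1] = np.log(features[:, 1] + LOG_EPS)
        return features

    def inverse_transform(self, features):
        features[:, 0] *= MTU
        features[:, 1] = np.exp(features[:, 1]) - LOG_EPS
        return features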