def get_data_by_balanced_folds(ASs, fold_idxs, required_num_samples=None):
    """Collect per-AS data into shared train/test folds so that every fold
    contains samples from all autonomous systems."""
    prev_autonomous_systems = global_vars.get('autonomous_systems')
    folds = {i: {'X_train': [], 'X_test': [], 'y_train': [], 'y_test': []}
             for i in range(global_vars.get('n_folds'))}
    for AS in ASs:
        global_vars.set('autonomous_systems', [AS])
        dataset = get_dataset('all')
        concat_train_val_sets(dataset)
        dataset = unify_dataset(dataset)
        if np.count_nonzero(dataset.X) == 0:
            print(f'dropped AS {AS} - no common handovers')
            continue
        try:
            if required_num_samples is not None:
                assert len(dataset.X) == required_num_samples
            for fold_idx in range(global_vars.get('n_folds')):
                folds[fold_idx]['X_train'].extend(dataset.X[fold_idxs[fold_idx]['train_idxs']])
                folds[fold_idx]['X_test'].extend(dataset.X[fold_idxs[fold_idx]['test_idxs']])
                folds[fold_idx]['y_train'].extend(dataset.y[fold_idxs[fold_idx]['train_idxs']])
                folds[fold_idx]['y_test'].extend(dataset.y[fold_idxs[fold_idx]['test_idxs']])
        except (IndexError, AssertionError):
            # this AS has fewer samples than the fold indices assume - skip it
            print(f'dropped AS {AS}')
    for fold in folds.values():
        for inner_key in fold.keys():
            fold[inner_key] = np.stack(fold[inner_key], axis=0)
    global_vars.set('autonomous_systems', prev_autonomous_systems)
    return folds
def find_optimal_samples_report(pretrained_model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}.pdf'
    if os.path.isfile(report_file_name):
        return
    eeg_chans = list(range(global_vars.get('eeg_chans')))
    plot_dict = OrderedDict()
    dataset = unify_dataset(dataset)
    for layer_idx, layer in list(enumerate(pretrained_model.children()))[global_vars.get('layer_idx_cutoff'):]:
        max_examples = get_max_examples_per_channel(dataset.X, layer_idx, pretrained_model)
        for chan_idx, example_idx in enumerate(max_examples):
            tf_data = []
            for eeg_chan in eeg_chans:
                # NOTE: sampling frequency is hard-coded to 250 Hz here
                tf_data.append(get_tf_data_efficient(dataset.X[example_idx][None, :, :], eeg_chan, 250))
            max_value = np.max(np.array(tf_data))
            class_str = ''
            # in the final layers, channel index corresponds to class index
            if layer_idx >= len(list(pretrained_model.children())) - 3:
                class_str = f', class:{label_by_idx(chan_idx)}'
            plot_dict[(layer_idx, chan_idx)] = tf_plot(tf_data,
                                                       f'TF plot of example {example_idx} for layer '
                                                       f'{layer_idx}, channel {chan_idx}{class_str}', max_value)
            print(f'plot most activating TF for layer {layer_idx}, channel {chan_idx}')
    img_paths = list(plot_dict.values())
    story = [Paragraph('<br />\n'.join(f'{x}:{y}' for x, y in pretrained_model._modules.items()),
                       style=styles["Normal"])]
    for im in img_paths:
        story.append(get_image(im))
    create_pdf_from_story(report_file_name, story)
    for im in img_paths:
        os.remove(im)
def power_diff_report(model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}.pdf'
    dataset = unify_dataset(dataset)
    class_examples = []
    nyquist = int(global_vars.get('frequency') / 2) - 1
    for class_idx in range(global_vars.get('n_classes')):
        class_examples.append(dataset.X[np.where(dataset.y == class_idx)])
    freqs = np.fft.fftfreq(global_vars.get('input_height'), 1 / global_vars.get('frequency'))
    freq_idx = np.argmax(freqs >= nyquist)  # first bin at or above the Nyquist frequency
    diff_array = np.zeros((global_vars.get('eeg_chans'), freq_idx))
    for chan in range(global_vars.get('eeg_chans')):
        # average spectrum per class, then take the magnitude of the difference
        first_power = np.average(np.fft.fft(class_examples[0][:, chan, :]).squeeze(), axis=0)[:freq_idx]
        second_power = np.average(np.fft.fft(class_examples[1][:, chan, :]).squeeze(), axis=0)[:freq_idx]
        diff_array[chan] = abs(first_power - second_power)
    fig, ax = plt.subplots(figsize=(18, 10))
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='5%', pad=0.05)
    im = ax.imshow(diff_array, cmap='hot', interpolation='nearest', aspect='auto',
                   extent=[0, nyquist, 1, global_vars.get('eeg_chans')])
    ax.set_title('frequency diff between classes')
    ax.set_ylabel('channel')
    ax.set_xlabel('frequency')
    fig.colorbar(im, cax=cax, orientation='vertical')
    filename = 'temp/freq_diff.png'
    plt.savefig(filename)
    story = [get_image(filename)]
    create_pdf_from_story(report_file_name, story)
    os.remove(filename)
def avg_class_tf_report(model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}.pdf'
    if os.path.isfile(report_file_name):
        return
    eeg_chans = list(range(global_vars.get('eeg_chans')))
    dataset = unify_dataset(dataset)
    class_examples = []
    for class_idx in range(global_vars.get('n_classes')):
        class_examples.append(dataset.X[np.where(dataset.y == class_idx)])
        if global_vars.get('to_eeglab'):
            tensor_to_eeglab(class_examples[-1], f'{folder_name}/avg_class_tf/{label_by_idx(class_idx)}.mat')
    chan_data = np.zeros((global_vars.get('n_classes'), len(eeg_chans),
                          global_vars.get('num_frex'), global_vars.get('input_height')))
    for class_idx in range(global_vars.get('n_classes')):
        for eeg_chan in eeg_chans:
            chan_data[class_idx, eeg_chan] = get_tf_data_efficient(class_examples[class_idx], eeg_chan,
                                                                   global_vars.get('frequency'),
                                                                   global_vars.get('num_frex'),
                                                                   dB=global_vars.get('db_normalization'))
    max_value = np.max(chan_data)
    tf_plots = []
    for class_idx in range(global_vars.get('n_classes')):
        tf_plots.append(tf_plot(chan_data[class_idx], f'average TF for {label_by_idx(class_idx)}', max_value))
    story = [get_image(tf) for tf in tf_plots]
    create_pdf_from_story(report_file_name, story)
    for tf in tf_plots:
        os.remove(tf)
def feature_importance_report(model, dataset, folder_name):
    FEATURE_VALUES = {}
    feature_mean = {}
    vmin = np.inf
    vmax = -np.inf
    report_file_name = f'{folder_name}/{global_vars.get("report")}_{global_vars.get("explainer")}.pdf'
    train_data = np_to_var(dataset['train'].X[:, :, :, None])
    model.cpu()
    # move (possibly nested) ensemble members to CPU and put them in eval mode
    if 'Ensemble' in type(model).__name__:
        for mod in model.models:
            if 'Ensemble' in type(mod).__name__:
                for inner_mod in mod.models:
                    inner_mod.cpu()
                    inner_mod.eval()
            mod.cpu()
            mod.eval()
    e = globals()[f'{global_vars.get("explainer")}_explainer'](model, train_data)
    shap_imgs = []
    for segment in ['test']:
        if segment == 'both':
            dataset = unify_dataset(dataset)
            segment_data = np_to_var(dataset.X[:, :, :, None])
        else:
            segment_data = np_to_var(dataset[segment].X[:, :, :, None])
        print(f'calculating {global_vars.get("explainer")} values for '
              f'{int(segment_data.shape[0] * global_vars.get("explainer_sampling_rate"))} samples')
        segment_examples = segment_data[np.random.choice(
            segment_data.shape[0],
            int(segment_data.shape[0] * global_vars.get("explainer_sampling_rate")),
            replace=False)]
        feature_values = e.get_feature_importance(segment_examples)
        feature_val = np.array(feature_values).squeeze()
        if feature_val.ndim == 4:
            feature_mean[segment] = np.mean(feature_val, axis=1)
        else:
            feature_mean[segment] = feature_val
        if global_vars.get('dataset') == 'netflow_asflow':
            save_feature_importances(folder_name, feature_mean[segment])
        else:
            np.save(f'{folder_name}/{global_vars.get("explainer")}_{segment}.npy', feature_mean[segment])
        # z-score the importance values before returning them
        feature_value = np.concatenate(feature_mean[segment], axis=0)
        feature_value = (feature_value - np.mean(feature_value)) / np.std(feature_value)
        FEATURE_VALUES[segment] = feature_value
        if feature_mean[segment].min() < vmin:
            vmin = feature_mean[segment].min()
        if feature_mean[segment].max() > vmax:
            vmax = feature_mean[segment].max()
    for segment in ['test']:
        img_file = plot_feature_importance_netflow(folder_name, feature_mean[segment],
                                                   global_vars.get('start_hour'), global_vars.get('dataset'),
                                                   segment, global_vars.get('explainer'))
        if global_vars.get('dataset') != 'netflow_asflow':
            plot_topo_feature_importance(folder_name, feature_mean[segment])
        shap_imgs.append(img_file)
    story = [get_image(im) for im in shap_imgs]
    create_pdf_from_story(report_file_name, story)
    global_vars.get('sacred_ex').add_artifact(report_file_name)
    for im in shap_imgs:
        os.remove(im)
    gc.collect()
    return FEATURE_VALUES
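# Example usage (a minimal sketch; `model` and `dataset` are assumed to come
# from the surrounding training pipeline, and the folder name is hypothetical):
def _example_feature_importance(model, dataset):
    values = feature_importance_report(model, dataset, folder_name='reports/run_1')
    return values['test']  # z-scored importance values for the test segment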
def get_leave_one_out(data_folder, test_subject_id):
    X_train, y_train = [], []
    X_test, y_test = [], []
    # train on every subject except the held-out one
    for subject_id in global_vars.get('subjects_to_check'):
        if subject_id != test_subject_id:
            dataset = get_dataset(subject_id)
            dataset = unify_dataset(dataset)
            X_train.extend(dataset.X)
            y_train.extend(dataset.y)
    # the held-out subject becomes the test set
    test_dataset = get_dataset(test_subject_id)
    test_dataset = unify_dataset(test_dataset)
    X_test.extend(test_dataset.X)
    y_test.extend(test_dataset.y)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=global_vars.get('valid_set_fraction'))
    train_set, valid_set, test_set = makeDummySignalTargets(
        X_train, y_train, X_val, y_val, X_test, y_test)
    return train_set, valid_set, test_set
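# Example usage (sketch, assuming global_vars is configured with
# 'subjects_to_check' and 'valid_set_fraction'; the data folder path and
# subject id below are hypothetical placeholders):
def _example_leave_one_out():
    train_set, valid_set, test_set = get_leave_one_out('data/subjects', test_subject_id=1)
    return train_set, valid_set, test_set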
def feature_importance_minmax_report(model, dataset, folder_name):
    # kept for interface parity with feature_importance_report; not populated here
    FEATURE_VALUES = {}
    report_file_name = f'{folder_name}/{global_vars.get("report")}_{global_vars.get("explainer")}.pdf'
    train_data = np_to_var(dataset['train'].X[:, :, :, None])
    model.cpu()
    if 'Ensemble' in type(model).__name__:
        for mod in model.models:
            if 'Ensemble' in type(mod).__name__:
                for inner_mod in mod.models:
                    inner_mod.cpu()
                    inner_mod.eval()
            mod.cpu()
            mod.eval()
    e = globals()[f'{global_vars.get("explainer")}_explainer'](model, train_data)
    shap_imgs = []
    # for segment in ['train', 'test', 'both']:
    for segment in ['test']:
        if segment == 'both':
            dataset = unify_dataset(dataset)
            segment_data = np_to_var(dataset.X[:, :, :, None])
        else:
            segment_data = np_to_var(dataset[segment].X[:, :, :, None])
        # examples whose peak target value is the smallest / largest in the segment
        min_example_idx = np.where(dataset[segment].y.max(axis=1) == np.amin(dataset[segment].y.max(axis=1)))[0]
        max_example_idx = np.where(dataset[segment].y.max(axis=1) == np.amax(dataset[segment].y.max(axis=1)))[0]
        min_example = segment_data[min_example_idx]
        max_example = segment_data[max_example_idx]
        min_feature_values = e.get_feature_importance(min_example)
        max_feature_values = e.get_feature_importance(max_example)
        min_feature_val = np.array(min_feature_values).squeeze()
        max_feature_val = np.array(max_feature_values).squeeze()
        np.save(f'{folder_name}/{global_vars.get("explainer")}_{segment}_min.npy', min_feature_val)
        np.save(f'{folder_name}/{global_vars.get("explainer")}_{segment}_max.npy', max_feature_val)
    for segment in ['test']:
        min_img_file = plot_feature_importance_netflow(folder_name, min_feature_val,
                                                       global_vars.get('start_hour'), global_vars.get('dataset'),
                                                       segment, global_vars.get('explainer'), title='min')
        max_img_file = plot_feature_importance_netflow(folder_name, max_feature_val,
                                                       global_vars.get('start_hour'), global_vars.get('dataset'),
                                                       segment, global_vars.get('explainer'), title='max')
        shap_imgs.append(min_img_file)
        shap_imgs.append(max_img_file)
    story = [get_image(im) for im in shap_imgs]
    create_pdf_from_story(report_file_name, story)
    global_vars.get('sacred_ex').add_artifact(report_file_name)
    for im in shap_imgs:
        os.remove(im)
    gc.collect()
    return FEATURE_VALUES
def get_fold_idxs(AS):
    if global_vars.get('k_fold_time'):
        kf = TimeSeriesSplit(n_splits=global_vars.get('n_folds'))
    else:
        kf = KFold(n_splits=global_vars.get('n_folds'), shuffle=True)
    prev_autonomous_systems = global_vars.get('autonomous_systems')
    global_vars.set('autonomous_systems', [AS])
    dataset = get_dataset('all')
    concat_train_val_sets(dataset)
    dataset = unify_dataset(dataset)
    fold_idxs = {i: {} for i in range(global_vars.get('n_folds'))}
    for fold_num, (train_index, test_index) in enumerate(kf.split(list(range(len(dataset.X))))):
        fold_idxs[fold_num]['train_idxs'] = train_index
        fold_idxs[fold_num]['test_idxs'] = test_index
    global_vars.set('autonomous_systems', prev_autonomous_systems)
    return fold_idxs
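# Example usage (sketch): derive fold indices from a reference AS, then build
# balanced folds over several ASs with get_data_by_balanced_folds above. The
# AS numbers are hypothetical placeholders; global_vars must already define
# 'n_folds', 'k_fold_time' and 'autonomous_systems'.
def _example_balanced_folds():
    fold_idxs = get_fold_idxs(3356)
    folds = get_data_by_balanced_folds([3356, 1299, 174], fold_idxs)
    return folds[0]['X_train'], folds[0]['y_train']  # stacked arrays for fold 0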
def export_data_to_file(dataset, format, out_folder, classes=None, transpose_time=False, unify=False):
    create_folder(out_folder)
    if unify:
        dataset = unify_dataset(dataset)
        dataset = {'all': dataset}
    for segment in dataset.keys():
        if classes is None:
            X_data = [dataset[segment].X]
            y_data = [dataset[segment].y]
            class_strs = ['']
        else:
            X_data, y_data, class_strs = [], [], []
            for class_idx in classes:
                X_data.append(dataset[segment].X[np.where(dataset[segment].y == class_idx)])
                y_data.append(dataset[segment].y[np.where(dataset[segment].y == class_idx)])
                class_strs.append(f'_{label_by_idx(class_idx)}')
        for X, y, class_str in zip(X_data, y_data, class_strs):
            if transpose_time:
                X = np.transpose(X, (0, 2, 1))
            if format == 'numpy':
                np.save(f'{out_folder}/X_{segment}{class_str}', X)
                np.save(f'{out_folder}/y_{segment}{class_str}', y)
            elif format == 'matlab':
                X = np.transpose(X, [1, 2, 0])
                savemat(f'{out_folder}/X_{segment}{class_str}.mat', {'data': X})
                savemat(f'{out_folder}/y_{segment}{class_str}.mat', {'data': y})
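# Example usage (sketch): export a dataset to per-class MATLAB files with the
# time axis transposed. The output folder and class indices are hypothetical;
# get_dataset is assumed to be configured via global_vars.
def _example_export():
    dataset = get_dataset('all')
    export_data_to_file(dataset, format='matlab', out_folder='exports/eeglab',
                        classes=[0, 1], transpose_time=True, unify=True)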
def perturbation_report(model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}_{global_vars.get("band_filter").__name__}.pdf'
    if os.path.isfile(report_file_name):
        return
    eeg_chans = list(range(get_dummy_input().shape[1]))
    tf_plots = []
    dataset = unify_dataset(dataset)
    for frequency in range(global_vars.get("low_freq"), global_vars.get("high_freq") + 1):
        single_subj_dataset = deepcopy(dataset)
        # filter a 2 Hz band centered on the current frequency
        perturbed_data = global_vars.get('band_filter')(single_subj_dataset.X, max(1, frequency - 1),
                                                        frequency + 1, global_vars.get('frequency'))
        if global_vars.get('to_matlab'):
            tensor_to_eeglab(perturbed_data,
                             f'{folder_name}/perturbation_report/frequency_{frequency}_'
                             f'{global_vars.get("band_filter").__name__}.mat')
        single_subj_dataset.X = perturbed_data
        subj_tfs = []
        for eeg_chan in eeg_chans:
            subj_tfs.append(get_tf_data_efficient(single_subj_dataset.X, eeg_chan, global_vars.get('frequency')))
        tf_plots.append(tf_plot(subj_tfs, f'average TF for subject {global_vars.get("subject_id")},'
                                          f' frequency {frequency}, {global_vars.get("band_filter").__name__}'))
    story = [get_image(tf) for tf in tf_plots]
    create_pdf_from_story(report_file_name, story)
    for tf in tf_plots:
        os.remove(tf)