def main(configs, dataset_dir, subject, log_dir, task):
    with open(configs, 'r') as f:
        merge_configs(yaml.load(f, Loader=yaml.FullLoader))

    # set the random state
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    torch.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)

    train_path = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                              cfg.TRAINING.MODEL.upper())
    assert os.path.exists(
        train_path), f"Can't detect training folder: {train_path}"
    log_dir = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                           cfg.TRAINING.MODEL.upper())

    if task == 'xpos':
        datasets = glob(dataset_dir + '*' + subject + '_*_xpos.mat')
    elif task == 'xvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_xvel.mat')
    elif task == 'abspos':
        datasets = glob(dataset_dir + '*' + subject + '_*_absPos.mat')
    elif task == 'absvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_absVel.mat')
    else:
        raise KeyError
    assert len(datasets) > 0, 'no datasets for subject %s found!' % subject

    for dataset_path in datasets:
        year, sub, day = osp.basename(dataset_path).split('_')[1:-1]
        trials, in_channels = read_dataset(dataset_path, 'D', False)
        trials_idx = list(range(len(trials)))
        _, valid_split = train_test_split(
            trials_idx,
            test_size=0.2,
            shuffle=False,
            random_state=cfg.TRAINING.RANDOM_SEED)
        valid_trials = [trials[valid_idx] for valid_idx in valid_split]

        rec_name = '_'.join([year, day])
        weights_path = os.path.join(train_path, rec_name, 'weights_final.pt')
        assert os.path.exists(
            weights_path), 'No weights are detected for this recording!'
        model = RNNs(in_channels=in_channels)
        model.load_state_dict(torch.load(weights_path))
        model.cuda()
        model.eval()

        # TODO: hardcoded path. save predictions near the saved weights
        with h5py.File(
                f'/home/fattouhm/notebooks/{year}_{sub}_{day}_{task}_predictions_{WINDOW_SIZE}.h5',
                'w') as hf:
            for trial_idx, trial in enumerate(valid_trials):
                inputs, targets = trial
                time_steps = inputs.shape[1]
                offsets = time_steps - WINDOW_SIZE + 1
                # one column per window offset; entries outside the window stay NaN
                predictions = np.empty((time_steps, offsets), dtype=np.float32)
                predictions[:] = np.nan
                dataset = create_dataset_loader(inputs, targets)
                for batch_offset, (X, _) in enumerate(dataset):
                    with torch.no_grad():
                        X = X.cuda()
                        output = model(X)
                    output = output.detach().squeeze(-1).cpu().numpy()
                    for sample_idx, sample_pred in enumerate(output):
                        # NOTE: assumes a loader batch size of 32
                        offset_idx = batch_offset * 32 + sample_idx
                        predictions[offset_idx:offset_idx + WINDOW_SIZE,
                                    offset_idx] = sample_pred
                hf.create_dataset(f'trial{trial_idx:0>2d}', data=predictions)
    print('Done!')
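# A minimal sketch (not part of the pipeline above) of how the NaN-padded
# prediction matrices written by main() could be collapsed back into a single
# prediction per time step, by averaging over every sliding-window offset that
# covers it. The file layout mirrors what main() writes; the helper itself and
# its name are hypothetical.
def average_window_predictions(h5_path):
    """Return {trial_name: 1-D averaged prediction} for one predictions file."""
    import h5py
    import numpy as np

    averaged = {}
    with h5py.File(h5_path, 'r') as hf:
        for trial_name in hf.keys():
            # shape (time_steps, offsets); NaN where a window does not cover a step
            preds = hf[trial_name][()]
            averaged[trial_name] = np.nanmean(preds, axis=1)
    return averaged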
    'legend.fontsize': fontsize + 2,
    'axes.titlesize': fontsize + 6,
    'xtick.labelsize': fontsize + 2,
    'ytick.labelsize': fontsize + 2
}
sns.set(rc=rc)
sns.set_style('darkgrid')

# %%
import yaml
from utils.config import cfg, merge_configs

# configs_file = osp.join(HOME, 'projects', 'ieeg', 'configurations', 'RNN_500EPOCHS_SEED1.yml')
configs_file = osp.join(HOME, 'projects', 'ieeg', 'configurations',
                        'DEEP4_500EPOCHS_SEED1.yml')
with open(configs_file, 'r') as f:
    merge_configs(yaml.load(f, Loader=yaml.FullLoader))
print(json.dumps(cfg, indent=2))

# %%
from utils.data_util import ECoGDatast, read_dataset

TASK = 'xpos'
# TASK = 'xvel'
# recording = f'data4DNN_14_PR3_day1_{TASK}.mat'
# recording = f'data4DNN_16_PR3_day1_{TASK}.mat'
# recording = f'data4DNN_15_PR4_day1_{TASK}.mat'
# recording = f'data4DNN_11_FR1_day1_{TASK}.mat'
# recording = f'data4DNN_11_FR3_day2_{TASK}.mat'
def main(configs, dataset_dir, subject, log_dir, num_layers, n_splits, task):
    with open(configs, 'r') as f:
        merge_configs(yaml.load(f, Loader=yaml.FullLoader))
    assert cfg.TRAINING.MODEL == 'RNN'
    assert cfg.RNNS.RNN.NUM_LAYERS == num_layers

    # set the random state
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    torch.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)

    if num_layers == 1:
        log_dir = os.path.join(log_dir, task.upper(), subject, '1Layer')
    else:
        log_dir = os.path.join(log_dir, task.upper(), subject,
                               str(num_layers) + 'Layers')
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    logging.basicConfig(
        level=logging.DEBUG,
        filename=os.path.join(log_dir, 'log.txt'),
        filemode='w+',
        format='%(levelname)s %(filename)s:%(lineno)4d: %(message)s')
    logger.info('Called with configs:')
    logger.info(json.dumps(cfg, indent=2))

    datasets = []
    if task == 'xpos':
        datasets = glob(dataset_dir + '*' + subject + '_*_xpos.mat')
    elif task == 'xvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_xvel.mat')
    elif task == 'abspos':
        datasets = glob(dataset_dir + '*' + subject + '_*_absPos.mat')
    elif task == 'absvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_absVel.mat')
    elif task == 'xacc':
        datasets = glob(dataset_dir + 'ALL_*' + subject + '_*_xacc.mat')
    elif task == 'absacc':
        datasets = glob(dataset_dir + '*' + subject + '_*_absAcc.mat')
    else:
        raise KeyError
    assert len(datasets) > 0, 'no datasets for subject %s found!' % subject

    # data frame to hold the cross-validation corr. and mse per day and fold
    rec_names = []
    for dataset_path in datasets:
        rec_day_name = os.path.basename(dataset_path).split('.')[0].split('_')
        rec_names.append('_'.join([rec_day_name[1], rec_day_name[3]]))
    index = pd.MultiIndex.from_product(
        [rec_names, ['fold%d' % fold for fold in range(1, n_splits + 1)]],
        names=['day', 'fold'])
    df = pd.DataFrame(index=index, columns=['corr', 'mse'])

    for dataset_path, rec_name in zip(datasets, rec_names):
        msg = ('Working on dataset %s:' % rec_name
               if task == 'multi' else dataset_path)
        logger.info(msg + '\n' + '=' * len(msg))
        dataset_name = 'D'
        if task == 'multi':
            trials, in_channels = read_multi_datasets(
                dataset_path, dataset_name,
                mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
            num_classes = len(dataset_path)
        else:
            trials, in_channels = read_dataset(
                dataset_path, dataset_name,
                mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
            num_classes = 1
        logger.info(f'{len(trials)} trials found')
        logger.info(f'Number of input channels: {in_channels}')
        if in_channels < 1:
            logger.warning('Zero valid channels found!')
            print('Zero valid channels found!')
            return

        crop_idx = list(range(len(trials)))
        if n_splits > 0:
            # ordered, deterministic splits (random_state is rejected when shuffle=False)
            kfold = KFold(n_splits=n_splits, shuffle=False)
        elif n_splits == -1:
            kfold = LeaveOneOut()
        else:
            raise ValueError(f'Invalid number of splits: {n_splits}')

        for fold_idx, (train_split, valid_split) in enumerate(
                kfold.split(crop_idx), 1):
            model, optimizer, scheduler, loss_fun, metric = create_model(
                in_channels, num_classes, CUDA)
            msg = f'FOLD{fold_idx}:'
            logger.info(msg)
            logger.info('=' * len(msg))
            logger.info('Training trials:')
            logger.info(train_split)
            logger.info('Validation trials:')
            logger.info(valid_split)
            training_writer = SummaryWriter(
                os.path.join(log_dir, rec_name, 'fold' + str(fold_idx),
                             'train'))
            valid_writer = SummaryWriter(
                os.path.join(log_dir, rec_name, 'fold' + str(fold_idx),
                             'valid'))
            weights_path = os.path.join(log_dir, rec_name,
                                        'fold' + str(fold_idx), 'weights.pt')
            training_trials = [trials[idx] for idx in train_split]
            valid_trials = [trials[idx] for idx in valid_split]
            corr, mse = training_loop(model, optimizer, scheduler, loss_fun,
                                      metric, training_trials,
                                      training_writer, valid_trials,
                                      valid_writer, weights_path, cuda=CUDA)
            if task == 'multi':
                for task_idx in range(len(corr)):
                    df.loc[(rec_name, 'fold' + str(fold_idx)),
                           TASK_NAMES[task_idx]] = corr['Class%d' % task_idx]
                df.loc[(rec_name, 'fold' + str(fold_idx)), 'mse'] = mse
            else:
                df.loc[(rec_name, 'fold' + str(fold_idx)), :] = [corr, mse]
            # write after every fold in case the script cannot run to completion
            df.to_csv(os.path.join(log_dir, 'cv_acc.csv'), index=True)
    print("Done!")
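# A minimal sketch (hypothetical helper, not part of the script above) of how
# the cv_acc.csv written after every fold could be summarized. It assumes the
# ('day', 'fold') MultiIndex and the 'corr'/'mse' columns built above.
def summarize_cv_results(csv_path):
    import pandas as pd

    df = pd.read_csv(csv_path, index_col=['day', 'fold'])
    # mean and std of the fold-wise correlation / MSE per recording day
    return df.groupby(level='day').agg(['mean', 'std'])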
def main(command, configs, dataset_dir, subject, log_dir, task):
    with open(configs, 'r') as f:
        merge_configs(yaml.load(f, Loader=yaml.FullLoader))
    # FIXME: PERTURBATION based connectivity analysis is buggy at the moment

    # set the random state
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    th.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)

    train_path = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                              cfg.TRAINING.MODEL.upper())
    assert os.path.exists(
        train_path), f"Can't detect training folder: {train_path}"
    log_dir = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                           cfg.TRAINING.MODEL.upper())

    if task == 'xpos':
        datasets = glob(dataset_dir + '*' + subject + '_*_xpos.mat')
    elif task == 'xvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_xvel.mat')
    elif task == 'abspos':
        datasets = glob(dataset_dir + '*' + subject + '_*_absPos.mat')
    elif task == 'absvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_absVel.mat')
    elif task == 'xacc':
        datasets = glob(dataset_dir + 'ALL_*' + subject + '_*_xacc.mat')
    elif task == 'absacc':
        datasets = glob(dataset_dir + 'ALL_*' + subject + '_*_absAcc.mat')
    else:
        raise KeyError
    assert len(datasets) > 0, 'no datasets for subject %s found!' % subject

    dummy_idx = 1 if cfg.TRAINING.DUMMY_IDX == 'f' else 3
    # assert cfg.TRAINING.NUM_CLASSES == 1
    for dataset_path in datasets:
        rec_day_name = os.path.basename(dataset_path).split('.')[0].split('_')
        rec_name = '_'.join([rec_day_name[1], rec_day_name[3]])
        dataset_name = cfg.EVAL.DATASET
        trials, in_channels = read_dataset(
            dataset_path, dataset_name,
            mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
        # TODO: split the dataset into training and evaluation sets; add an
        # option to set the split size
        data_loader = create_eval_loader(trials)
        num_classes = 1

        weights_path = os.path.join(train_path, rec_name, 'weights_final.pt')
        assert os.path.exists(
            weights_path
        ), f'No weights are detected for this recording in {weights_path}'
        model, _, _, _, _ = create_model(in_channels, num_classes, CUDA)
        if CUDA:
            model.cuda()
        model.load_state_dict(th.load(weights_path))
        # TODO: check that the weights are loaded properly, e.g. by checking
        # the corr. of the validation set.
        # cudnn RNN backward can only be called in training mode
        model.train()
        eval_dropouts(model)

        # mean_autocorr_x = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
        # mean_autocorr_y = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
        grads = []
        if 'io' in command:
            output_name = 'input'
            # grads = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
        elif 'freq' in command:
            output_name = 'amps'
            fft_freq = np.fft.rfftfreq(cfg.TRAINING.CROP_LEN,
                                       1 / cfg.TRAINING.INPUT_SAMPLING_RATE)
            num_freq_bins = len(fft_freq)
            # grads = np.zeros(cfg.TRAINING.CROP_LEN // 2 + 1, dtype=np.float32)
        elif 'pert' in command:
            output_name = 'pert'
            rng = np.random.RandomState(cfg.TRAINING.RANDOM_SEED)
            perturbation_list = []
            output_diff_list = []
        elif 'spectrogram' in command:
            output_name = 'spectrogram'
            window_size = 250
            # overlap = 125
            overlap = 245
            unique = window_size - overlap
            han = th.tensor(np.hanning(window_size),
                            requires_grad=False,
                            dtype=th.float)
            # we have now 3 dimensions
            # 1. batch (average over samples)
            # 2. fft amps
            # 3. time (4000 / 250) (concatenate grads w.r.t. amps along the x axis)
            fft_freq = np.fft.rfftfreq(window_size,
                                       1 / cfg.TRAINING.INPUT_SAMPLING_RATE)
            num_freq_bins = len(fft_freq)
            time_bins = list(
                range(0, cfg.TRAINING.CROP_LEN - window_size, unique))
            # grads = np.zeros((len(time_bins), int(num_freq_bins)), dtype=np.float32)
            grads = []
        else:
            raise RuntimeError('command not understood!')

        for X, Y in data_loader:
            # autocorr_x = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
            # for c in range(X.shape[2]):
            #     autocorr_x += np.correlate(X[0, 0, c, :], X[0, 0, c, :], 'full')[cfg.TRAINING.CROP_LEN-1:]
            # mean_autocorr_x += autocorr_x / X.shape[2]
            # mean_autocorr_y += np.correlate(Y.squeeze(), Y.squeeze(), 'full')[cfg.TRAINING.CROP_LEN-1:]
            if 'freq' in command:
                # grads w.r.t. frequency amp
                amps_th, iffted = fb_fft(X.squeeze(dummy_idx),
                                         cfg.TRAINING.CROP_LEN)
                model.zero_grad()
                output = model(iffted.unsqueeze(dummy_idx))
                # This is Robin's approach
                # output.mean().backward()
                # this is my approach
                output[:, -1, 0].backward(
                    th.ones(iffted.shape[0], device=iffted.device))
                # grads.append(th.mean(th.abs(amps_th.grad), dim=1).cpu().numpy())
                grads.append(amps_th.grad.cpu().numpy())
                assert grads[-1].ndim == 3
                assert grads[-1].shape[-1] == num_freq_bins
            elif 'spectrogram' in command:
                # time-resolved grads w.r.t. frequency amp
                window_grads = []
                for i in time_bins:
                    window = X[:, :, :, i:i + window_size] * han
                    amps_th, iffted = fb_fft(window, window_size)
                    rest_after = th.tensor(X[:, :, :, i + window_size:],
                                           requires_grad=False,
                                           dtype=th.float,
                                           device='cuda')
                    if i > 0:
                        rest_before = th.tensor(X[:, :, :, :i],
                                                requires_grad=False,
                                                dtype=th.float,
                                                device='cuda')
                        input_tensor = th.cat(
                            (rest_before, iffted, rest_after), dim=3)
                    else:
                        input_tensor = th.cat((iffted, rest_after), dim=3)
                    model.zero_grad()
                    output = model(input_tensor)
                    output[:, -1, 0].backward(
                        th.ones(input_tensor.shape[0],
                                device=input_tensor.device))
                    window_grads.append(
                        np.expand_dims(th.mean(
                            th.abs(amps_th.grad.squeeze(dummy_idx)),
                            dim=1).cpu().numpy(),
                                       axis=1))
                # grads += np.vstack(window_grads)
                grads.append(np.hstack(window_grads))
                assert grads[-1].shape[0] == input_tensor.shape[0]
                assert grads[-1].shape[1] == len(time_bins)
                assert grads[-1].shape[-1] == num_freq_bins
            elif 'io' in command:
                # grads w.r.t. input
                input_tensor = th.tensor(X,
                                         requires_grad=True,
                                         dtype=th.float,
                                         device='cuda')
                model.zero_grad()
                output = model(input_tensor)
                output[:, -1, 0].backward(
                    th.ones(input_tensor.shape[0],
                            device=input_tensor.device))
                # channels dimension
                grads.append(
                    input_tensor.grad.squeeze(dummy_idx).cpu().numpy())
                assert grads[-1].ndim == 3
                assert grads[-1].shape[-1] == cfg.TRAINING.CROP_LEN
            elif 'pert' in command:
                # find the model output given the input before perturbation
                with th.no_grad():
                    input_tensor = th.tensor(X, dtype=th.float, device='cuda')
                    model.zero_grad()
                    output_before_pert = model(
                        input_tensor).detach().cpu().numpy()[0, -1, 0]
                # perturb the input signal and find the output
                for _ in range(1000):
                    amps_th, iffted, pert_values = fb_fft_with_perturbation(
                        X, amp_perturbation_additive, cfg.TRAINING.CROP_LEN,
                        rng=rng)
                    output_after_pert = model(
                        iffted).detach().cpu().numpy()[0, -1, 0]
                    # append perturbations and output diff from all pert.
                    # iterations and mini-batches
                    output_diff_list.append(output_after_pert -
                                            output_before_pert)
                    perturbation_list.append(
                        np.expand_dims(pert_values.squeeze(), 2))
            else:
                raise RuntimeError('command not understood!')

        if 'pert' not in command:
            # grads /= len(data_loader) * cfg.TRAINING.BATCH_SIZE
            grads_array = np.vstack(grads)
            assert grads_array.shape[0] == len(data_loader.dataset)
            # np.savetxt(f'grads_{cfg.TRAINING.MODEL}_{task}_{subject}_{rec_name}.csv', grads_array, delimiter=',')
            # np.save(f'grads_{cfg.TRAINING.MODEL}_{task}_{subject}_{rec_name}_NEW2.csv', grads_array)
            grads = stat_fn(np.abs(grads_array), axis=(0, 1))
            if 'spectrogram' in command:
                assert grads.ndim == 2
                assert grads.shape[0] == len(time_bins)
                assert grads.shape[1] == num_freq_bins
            elif 'freq' in command:
                assert grads.ndim == 1
                assert grads.shape[0] == num_freq_bins
            else:
                assert grads.ndim == 1
                assert grads.shape[0] == cfg.TRAINING.CROP_LEN
        else:
            output_diff = np.array(output_diff_list)
            perturbations = np.dstack(perturbation_list)
            grads = np.mean(np.abs(
                np.array([[
                    corr(output_diff.reshape(1, -1), pert_fb.reshape(1, -1))
                    for pert_fb in perturbation
                ] for perturbation in perturbations])),
                            axis=0).squeeze()

        # mean_autocorr_x /= len(data_loader) * cfg.TRAINING.BATCH_SIZE
        # mean_autocorr_y /= len(data_loader) * cfg.TRAINING.BATCH_SIZE
        np.savetxt(os.path.join(log_dir, rec_name,
                                f"connectivity_{output_name}_{STAT}.csv"),
                   grads,
                   delimiter=',')
        # np.savetxt(os.path.join(log_dir, rec_name, f"connectivity_{output_name}_{STAT}_new2.csv"), grads, delimiter=',')
        # np.savetxt(os.path.join(log_dir, rec_name, 'autocorr_x.csv'), mean_autocorr_x, delimiter=',')
        # np.savetxt(os.path.join(log_dir, rec_name, 'autocorr_y.csv'), mean_autocorr_y, delimiter=',')
    print('Done!')
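# A minimal sketch (hypothetical, not part of the analysis above) of how the
# saved connectivity CSVs could be visualized. It assumes the shapes produced
# above: 2-D (time bins x frequency bins) for 'spectrogram', 1-D otherwise.
def plot_connectivity(csv_path):
    import matplotlib.pyplot as plt
    import numpy as np

    grads = np.loadtxt(csv_path, delimiter=',')
    fig, ax = plt.subplots()
    if grads.ndim == 2:
        # time-resolved gradients w.r.t. the windowed FFT amplitudes
        im = ax.imshow(grads.T, aspect='auto', origin='lower')
        ax.set_xlabel('time bin')
        ax.set_ylabel('frequency bin')
        fig.colorbar(im, ax=ax)
    else:
        # gradients w.r.t. input samples or full-crop FFT amplitudes
        ax.plot(grads)
        ax.set_xlabel('bin')
        ax.set_ylabel('gradient statistic')
    return fig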
def main(mode, configs, dataset_dir, subject, log_dir, n_splits, task):
    with open(configs, 'r') as f:
        merge_configs(yaml.load(f, Loader=yaml.FullLoader))
    subject = subject.strip('\'\"')

    # set the random state
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    torch.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)

    model_name = cfg.TRAINING.MODEL.upper()
    if 'RNN' in model_name:
        # include the RNN's number of layers in the log dir
        model_name = f'{cfg.RNNS.RNN.NUM_LAYERS}L_{model_name}'
    if mode == 'eval':
        train_path = parse_log_dir(log_dir, 'train', cfg.TRAINING.SPLIT,
                                   subject, task, model_name)
        assert os.path.exists(
            train_path), f"Can't detect training folder: {train_path}"
        log_dir = parse_log_dir(log_dir, mode, cfg.EVAL.SPLIT, subject, task,
                                model_name)
    else:
        log_dir = parse_log_dir(log_dir, mode, cfg.TRAINING.SPLIT, subject,
                                task, model_name)

    # TODO: get all recordings with the same task from all subjects
    rec_paths = get_rec_paths(dataset_dir, task, subject)
    assert len(rec_paths) > 0, 'no datasets for subject %s found!' % subject

    rec_names = []  # full rec_names including the subject code
    if task == 'multi':
        for rec_path in [rec_path[0] for rec_path in rec_paths]:
            rec_names.append(rec_name_from_path(rec_path))
    else:
        for rec_path in rec_paths:
            rec_names.append(rec_name_from_path(rec_path))

    print('found the following datasets:')
    for rec_path in rec_paths:
        print(rec_path)

    for rec_path, rec_name in zip(rec_paths, rec_names):
        print(f"Working on {rec_path}")
        rec_dir = osp.join(log_dir, rec_name)
        if not osp.exists(rec_dir):
            os.makedirs(rec_dir)
        # set up the logging file handler
        setup_logging_handler(root, rec_dir)
        logger.info('Called with configs:')
        logger.info(json.dumps(cfg, indent=2))
        # export the configs as JSON
        with open(osp.join(rec_dir, 'configs.json'), 'w+') as fp:
            json.dump(cfg, fp, indent=2)

        msg = ('Working on dataset %s:' % rec_name
               if task == 'multi' else rec_path)
        logger.info(msg + '\n' + '=' * len(msg))

        if mode == 'cv' or mode == 'train':
            dataset_name = 'D'
        else:
            dataset_name = cfg.EVAL.DATASET
        # TODO: refactor and simplify
        if task == 'multi':
            trials, in_channels = read_multi_datasets(
                rec_path, dataset_name,
                mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
            num_classes = len(rec_path)
        else:
            trials, in_channels = read_dataset(
                rec_path, dataset_name,
                mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
            num_classes = 1
        # TODO: move the logs and the check into read_dataset
        logger.info(f'{len(trials)} trials found')
        logger.info(f'Number of input channels: {in_channels}')
        if in_channels < 1:
            logger.warning('Zero valid channels found!')
            print('Zero valid channels found!')
            return

        if mode == 'cv':
            run_cv(n_splits, task, rec_name, rec_dir, trials, in_channels,
                   num_classes)
        elif mode == 'train':
            run_training(task, rec_name, rec_dir, trials, in_channels,
                         num_classes)
        # eval
        else:
            run_eval(task, train_path, rec_name, rec_dir, trials, in_channels,
                     num_classes)
        print(f"Finished working on {rec_path}")
    logger.info('Done!')
    print("Done!")
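# A hedged usage sketch: main() above is presumably wrapped by a CLI, but it
# can also be called directly as the plain function defined here (if it is
# wrapped by a CLI decorator, invoke it through that CLI instead). All paths
# and values below are hypothetical placeholders.
if __name__ == '__main__':
    main(mode='cv',
         configs='configurations/DEEP4_500EPOCHS_SEED1.yml',
         dataset_dir='/path/to/datasets/',
         subject='FR1',
         log_dir='/path/to/logs/',
         n_splits=5,
         task='xpos')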