def load_target_trials(dataset_path):
    """Return the target signals of the held-out validation trials.

    Reads the recording at *dataset_path* (dataset key 'D'), splits the
    trial indices 80/20 without shuffling (so the validation split is the
    deterministic last 20% of trials) and returns element ``[1]`` (the
    target) of each validation trial.
    """
    all_trials, _ = read_dataset(dataset_path, 'D', False)
    indices = list(range(len(all_trials)))
    # shuffle=False keeps the split deterministic; random_state is inert here
    _, held_out = train_test_split(indices,
                                   test_size=0.2,
                                   shuffle=False,
                                   random_state=RANDOM_SEED)
    return [all_trials[i][1] for i in held_out]
def main(configs, dataset_dir, subject, log_dir, task):
    """Run a trained RNN over the validation trials of every recording of
    *subject* for *task* and dump per-window predictions to an HDF5 file.

    Locates the weights written by an earlier training run under
    ``log_dir/<TASK>/TRAIN/<subject>/<MODEL>``, rebuilds the model, and for
    each validation trial writes a ``(time_steps, offsets)`` prediction
    matrix (one HDF5 dataset per trial).
    """
    with open(configs, 'r') as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files -- consider yaml.safe_load.
        merge_configs(yaml.load(f))
    # set the random state (numpy, torch and stdlib random) for reproducibility
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    torch.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)
    train_path = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                              cfg.TRAINING.MODEL.upper())
    assert os.path.exists(
        train_path), f"Can't detect training folder: {train_path}"
    # log_dir is rebound to the same path as train_path
    log_dir = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                           cfg.TRAINING.MODEL.upper())
    # map the task name to the matching .mat recordings of this subject
    if task == 'xpos':
        datasets = glob(dataset_dir + '*' + subject + '_*_xpos.mat')
    elif task == 'xvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_xvel.mat')
    elif task == 'abspos':
        datasets = glob(dataset_dir + '*' + subject + '_*_absPos.mat')
    elif task == 'absvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_absVel.mat')
    else:
        raise KeyError
    assert len(datasets) > 0, 'no datasets for subject %s found!' % subject
    for dataset_path in datasets:
        # file names look like <prefix>_<year>_<sub>_<day>_<task>.mat
        year, sub, day = osp.basename(dataset_path).split('_')[1:-1]
        trials, in_channels = read_dataset(dataset_path, 'D', False)
        trials_idx = list(range(len(trials)))
        # shuffle=False: deterministic split (last 20% of trials);
        # random_state has no effect when shuffle is disabled
        _, valid_split = train_test_split(
            trials_idx,
            test_size=0.2,
            shuffle=False,
            random_state=cfg.TRAINING.RANDOM_SEED)
        valid_trials = [trials[valid_idx] for valid_idx in valid_split]
        rec_name = '_'.join([year, day])
        weights_path = os.path.join(train_path, rec_name, 'weights_final.pt')
        assert os.path.exists(
            weights_path), 'No weights are detected for this recording!'
        model = RNNs(in_channels=in_channels)
        model.load_state_dict(torch.load(weights_path))
        model.cuda()
        model.eval()
        # TODO: hardcoded path. save predictions near the saved weights
        with h5py.File(
                f'/home/fattouhm/notebooks/{year}_{sub}_{day}_{task}_predictions_{WINDOW_SIZE}.h5',
                'w') as hf:
            for trial_idx, trial in enumerate(valid_trials):
                inputs, targets = trial
                # assumes inputs is (channels, time) -- TODO confirm
                time_steps = inputs.shape[1]
                # number of sliding-window start positions in this trial
                offsets = time_steps - WINDOW_SIZE + 1
                # predictions[t, o] = prediction for time step t from the
                # window starting at offset o; NaN where the window does not
                # cover t
                predictions = np.empty((time_steps, offsets),
                                       dtype=np.float32)
                predictions[:] = np.nan
                dataset = create_dataset_loader(inputs, targets)
                for batch_offset, (X, _) in enumerate(dataset):
                    with torch.no_grad():
                        X = X.cuda()
                        output = model(X)
                        output = output.detach().squeeze(-1).cpu().numpy()
                    for sample_idx, sample_pred in enumerate(output):
                        # NOTE(review): the hardcoded 32 assumes the loader's
                        # batch size is 32 -- confirm against
                        # create_dataset_loader
                        offset_idx = batch_offset * 32 + sample_idx
                        # write the whole window prediction down its column
                        predictions[offset_idx:offset_idx + WINDOW_SIZE,
                                    offset_idx] = sample_pred
                hf.create_dataset(f'trial{trial_idx:0>2d}', data=predictions)
    print('Done!')
TASK = 'xpos' # TASK = 'xvel' # recording = f'data4DNN_14_PR3_day1_{xpos}.mat' # recording = f'data4DNN_16_PR3_day1_{TASK}.mat' # recording = f'data4DNN_15_PR4_day1_{TASK}.mat' # recording = f'data4DNN_11_FR1_day1_{TASK}.mat' # recording = f'data4DNN_11_FR3_day2_{TASK}.mat' # recording = f'data4DNN_11_FR2_day2_{TASK}.mat' recording = f'data4DNN_17_PR10_day1_{TASK}.mat' dataset_path = os.path.join(dataset_home, recording) trials, in_channels = read_dataset(dataset_path, 'D', False) ecog_dataset = torch.utils.data.ConcatDataset([ ECoGDatast(X, y, window=cfg.TRAINING.CROP_LEN, stride=cfg.EVAL.INPUT_STRIDE, x2y_ratio=cfg.TRAINING.INPUT_SAMPLING_RATE / cfg.TRAINING.OUTPUT_SAMPLING_RATE, input_shape='ct', dummy_idx=cfg.TRAINING.DUMMY_IDX) for (X, y) in trials ]) # %% # %%
def main(command, configs, dataset_dir, subject, log_dir, task):
    """Gradient/perturbation-based connectivity analysis for one subject.

    Depending on *command*, computes per-recording saliency of the trained
    model's last output with respect to:
      * 'io'           -- the raw input samples,
      * 'freq'         -- the FFT amplitudes of the whole crop,
      * 'spectrogram'  -- windowed FFT amplitudes (time-resolved),
      * 'pert'         -- correlation between random amplitude perturbations
                          and the resulting output change.
    Results are saved as CSV next to the recording's training logs.
    """
    with open(configs, 'r') as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files -- consider yaml.safe_load.
        merge_configs(yaml.load(f))
    # FIXME: PERTURBATION based connectivity analysis is buggy at the moment
    # set the random state (numpy, torch and stdlib random)
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    th.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)
    train_path = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                              cfg.TRAINING.MODEL.upper())
    assert os.path.exists(
        train_path), f"Can't detect training folder: {train_path}"
    # log_dir is rebound to the same path as train_path
    log_dir = os.path.join(log_dir, task.upper(), 'TRAIN', subject,
                           cfg.TRAINING.MODEL.upper())
    # map the task name to the matching .mat recordings of this subject
    if task == 'xpos':
        datasets = glob(dataset_dir + '*' + subject + '_*_xpos.mat')
    elif task == 'xvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_xvel.mat')
    elif task == 'abspos':
        datasets = glob(dataset_dir + '*' + subject + '_*_absPos.mat')
    elif task == 'absvel':
        datasets = glob(dataset_dir + '*' + subject + '_*_absVel.mat')
    elif task == 'xacc':
        datasets = glob(dataset_dir + 'ALL_*' + subject + '_*_xacc.mat')
    elif task == 'absacc':
        datasets = glob(dataset_dir + 'ALL_*' + subject + '_*_absAcc.mat')
    else:
        raise KeyError
    assert len(datasets) > 0, 'no datasets for subject %s found!' % subject
    # index of the singleton ("dummy") axis in the input tensors
    dummy_idx = 1 if cfg.TRAINING.DUMMY_IDX == 'f' else 3
    # assert cfg.TRAINING.NUM_CLASSES == 1
    for dataset_path in datasets:
        rec_day_name = os.path.basename(dataset_path).split('.')[0].split('_')
        rec_name = '_'.join([rec_day_name[1], rec_day_name[3]])
        dataset_name = cfg.EVAL.DATASET
        trials, in_channels = read_dataset(
            dataset_path, dataset_name, mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
        # TODO: split dataset to training and evaluation sets. Add additional
        # option to set split size
        data_loader = create_eval_loader(trials)
        num_classes = 1
        weights_path = os.path.join(train_path, rec_name, 'weights_final.pt')
        # NOTE(review): message below is not an f-string, so {weights_path}
        # is printed literally -- probably a missing 'f' prefix.
        assert os.path.exists(
            weights_path
        ), 'No weights are detected for this recording! in {weights_path}'
        model, _, _, _, _ = create_model(in_channels, num_classes, CUDA)
        if CUDA:
            model.cuda()
        model.load_state_dict(th.load(weights_path))
        # TODO: check if the weights are loaded properly. check the corr of
        # validation set for example.
        # cudnn RNN backward can only be called in training mode
        model.train()
        # keep dropout layers in eval mode even though the model is in train
        # mode (see eval_dropouts)
        eval_dropouts(model)
        # mean_autocorr_x = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
        # mean_autocorr_y = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
        grads = []
        # per-command setup: output file tag and command-specific state
        if 'io' in command:
            output_name = 'input'
            # grads = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
        elif 'freq' in command:
            output_name = 'amps'
            fft_freq = np.fft.rfftfreq(cfg.TRAINING.CROP_LEN,
                                       1 / cfg.TRAINING.INPUT_SAMPLING_RATE)
            num_freq_bins = len(fft_freq)
            # grads = np.zeros(cfg.TRAINING.CROP_LEN // 2 + 1, dtype=np.float32)
        elif 'pert' in command:
            output_name = 'pert'
            rng = np.random.RandomState(cfg.TRAINING.RANDOM_SEED)
            perturbation_list = []
            output_diff_list = []
        elif 'spectrogram' in command:
            output_name = 'spectrogram'
            window_size = 250
            # overlap = 125
            overlap = 245
            # hop size between consecutive analysis windows
            unique = window_size - overlap
            han = th.tensor(np.hanning(window_size),
                            requires_grad=False,
                            dtype=th.float)
            # we have now 3 dimensions
            # 1. batch (average over samples)
            # 2. fft amps
            # 3. time (4000 / 250) (concatanate grads w.r.t. amps along the
            #    x axis)
            fft_freq = np.fft.rfftfreq(window_size,
                                       1 / cfg.TRAINING.INPUT_SAMPLING_RATE)
            num_freq_bins = len(fft_freq)
            time_bins = list(
                range(0, cfg.TRAINING.CROP_LEN - window_size, unique))
            # grads = np.zeros((len(time_bins), int(num_freq_bins)), dtype=np.float32)
            grads = []
        else:
            raise RuntimeError('command not understood!')
        for X, Y in data_loader:
            # autocorr_x = np.zeros(cfg.TRAINING.CROP_LEN, dtype=np.float32)
            # for c in range(X.shape[2]):
            #     autocorr_x += np.correlate(X[0, 0, c, :], X[0, 0, c, :], 'full')[cfg.TRAINING.CROP_LEN-1:]
            # mean_autocorr_x += autocorr_x / X.shape[2]
            # mean_autocorr_y += np.correlate(Y.squeeze(), Y.squeeze(), 'full')[cfg.TRAINING.CROP_LEN-1:]
            if 'freq' in command:
                # grads w.r.t. frequency amp
                amps_th, iffted = fb_fft(X.squeeze(dummy_idx),
                                         cfg.TRAINING.CROP_LEN)
                model.zero_grad()
                output = model(iffted.unsqueeze(dummy_idx))
                # This is Robin's approach
                # output.mean().backward()
                # this is my approach: backprop only from the last time step
                # of the first output class, one grad per batch sample
                output[:, -1, 0].backward(
                    th.ones(iffted.shape[0], device=iffted.device))
                # grads.append(th.mean(th.abs(amps_th.grad), dim=1).cpu().numpy())
                grads.append(amps_th.grad.cpu().numpy())
                assert grads[-1].ndim == 3
                assert grads[-1].shape[-1] == num_freq_bins
            elif 'spectrogram' in command:
                # time-resolved grads w.r.t frequency amp: FFT one Hann
                # window at a time, splice the (differentiable) inverse back
                # between the untouched before/after segments
                window_grads = []
                for i in time_bins:
                    window = X[:, :, :, i:i + window_size] * han
                    amps_th, iffted = fb_fft(window, window_size)
                    rest_after = th.tensor(X[:, :, :, i + window_size:],
                                           requires_grad=False,
                                           dtype=th.float,
                                           device='cuda')
                    if i > 0:
                        rest_before = th.tensor(X[:, :, :, :i],
                                                requires_grad=False,
                                                dtype=th.float,
                                                device='cuda')
                        input_tensor = th.cat(
                            (rest_before, iffted, rest_after), dim=3)
                    else:
                        input_tensor = th.cat((iffted, rest_after), dim=3)
                    model.zero_grad()
                    output = model(input_tensor)
                    output[:, -1, 0].backward(
                        th.ones(input_tensor.shape[0],
                                device=input_tensor.device))
                    # mean |grad| over channels -> (batch, 1, freq) per window
                    window_grads.append(
                        np.expand_dims(th.mean(th.abs(
                            amps_th.grad.squeeze(dummy_idx)),
                                               dim=1).cpu().numpy(),
                                       axis=1))
                # grads += np.vstack(window_grads)
                # stack windows along axis 1 -> (batch, time_bins, freq)
                grads.append(np.hstack(window_grads))
                assert grads[-1].shape[0] == input_tensor.shape[0]
                assert grads[-1].shape[1] == len(time_bins)
                assert grads[-1].shape[-1] == num_freq_bins
            elif 'io' in command:
                # grads w.r.t. input
                input_tensor = th.tensor(X,
                                         requires_grad=True,
                                         dtype=th.float,
                                         device='cuda')
                model.zero_grad()
                output = model(input_tensor)
                output[:, -1, 0].backward(
                    th.ones(input_tensor.shape[0],
                            device=input_tensor.device))
                # channels dimension
                grads.append(
                    input_tensor.grad.squeeze(dummy_idx).cpu().numpy())
                assert grads[-1].ndim == 3
                assert grads[-1].shape[-1] == cfg.TRAINING.CROP_LEN
            elif 'pert' in command:
                # grads w.r.t. input
                # find the model output given the input before perturbation
                with th.no_grad():
                    input_tensor = th.tensor(X, dtype=th.float, device='cuda')
                    model.zero_grad()
                    output_before_pert = model(
                        input_tensor).detach().cpu().numpy()[0, -1, 0]
                # perturb the input signal and find the output
                for _ in range(1000):
                    amps_th, iffted, pert_values = fb_fft_with_perturbation(
                        X,
                        amp_perturbation_additive,
                        cfg.TRAINING.CROP_LEN,
                        rng=rng)
                    output_after_pert = model(
                        iffted).detach().cpu().numpy()[0, -1, 0]
                    # append perturbations and output diff from all pert.
                    # iterations and mini-batches
                    output_diff_list.append(output_after_pert -
                                            output_before_pert)
                    perturbation_list.append(
                        np.expand_dims(pert_values.squeeze(), 2))
            else:
                raise RuntimeError('command not understood!')
        if 'pert' not in command:
            # grads /= len(data_loader) * cfg.TRAINING.BATCH_SIZE
            # collapse the per-batch list into one array over all samples
            grads_array = np.vstack(grads)
            assert grads_array.shape[0] == len(data_loader.dataset)
            # np.savetxt(f'grads_{cfg.TRAINING.MODEL}_{task}_{subject}_{rec_name}.csv', grads_array, delimiter=',')
            # np.save(f'grads_{cfg.TRAINING.MODEL}_{task}_{subject}_{rec_name}_NEW2.csv', grads_array)
            # reduce |grads| over samples and channels with the module-level
            # stat_fn (e.g. mean/median -- see STAT)
            grads = stat_fn(np.abs(grads_array), axis=(0, 1))
            if 'spectrogram' in command:
                assert grads.ndim == 2
                assert grads.shape[0] == len(time_bins)
                assert grads.shape[1] == num_freq_bins
            elif 'freq' in command:
                assert grads.ndim == 1
                assert grads.shape[0] == num_freq_bins
            else:
                assert grads.ndim == 1
                assert grads.shape[0] == cfg.TRAINING.CROP_LEN
        else:
            # correlate each (channel, freq) perturbation series with the
            # observed output differences, then average |corr| over samples
            output_diff = np.array(output_diff_list)
            perturbations = np.dstack(perturbation_list)
            grads = np.mean(np.abs(
                np.array([[
                    corr(output_diff.reshape(1, -1), pert_fb.reshape(1, -1))
                    for pert_fb in perturbation
                ] for perturbation in perturbations])),
                            axis=0).squeeze()
        # mean_autocorr_x /= len(data_loader) * cfg.TRAINING.BATCH_SIZE
        # mean_autocorr_y /= len(data_loader) * cfg.TRAINING.BATCH_SIZE
        np.savetxt(os.path.join(log_dir, rec_name,
                                f"connectivity_{output_name}_{STAT}.csv"),
                   grads,
                   delimiter=',')
        # np.savetxt(os.path.join(log_dir, rec_name, f"connectivity_{output_name}_{STAT}_new2.csv"), grads, delimiter=',')
        # np.savetxt(os.path.join(log_dir, rec_name, 'autocorr_x.csv'), mean_autocorr_x, delimiter=',')
        # np.savetxt(os.path.join(log_dir, rec_name, 'autocorr_y.csv'), mean_autocorr_y, delimiter=',')
    print('Done!')
def main(mode, configs, dataset_dir, subject, log_dir, n_splits, task):
    """Driver for cross-validation, training or evaluation of one subject.

    For every recording of *subject* matching *task*, sets up a per-recording
    log directory, loads the trials and dispatches to ``run_cv``,
    ``run_training`` or ``run_eval`` depending on *mode*
    ('cv' | 'train' | 'eval').
    """
    with open(configs, 'r') as f:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted files -- consider yaml.safe_load.
        merge_configs(yaml.load(f))
    # strip shell quoting that may survive argument parsing
    subject = subject.strip('\'\"')
    # set the random state (numpy, torch and stdlib random)
    np.random.seed(cfg.TRAINING.RANDOM_SEED)
    torch.manual_seed(cfg.TRAINING.RANDOM_SEED)
    random.seed(cfg.TRAINING.RANDOM_SEED)
    model_name = cfg.TRAINING.MODEL.upper()
    if 'RNN' in model_name:
        # consider RNNs #layers in log dir (e.g. '3L_RNN')
        model_name = f'{cfg.RNNS.RNN.NUM_LAYERS}L_{model_name}'
    if mode == 'eval':
        # evaluation needs the existing training folder for the weights
        train_path = parse_log_dir(log_dir, 'train', cfg.TRAINING.SPLIT,
                                   subject, task, model_name)
        assert os.path.exists(
            train_path), f"Can't detect training folder: {train_path}"
        log_dir = parse_log_dir(log_dir, mode, cfg.EVAL.SPLIT, subject, task,
                                model_name)
    else:
        log_dir = parse_log_dir(log_dir, mode, cfg.TRAINING.SPLIT, subject,
                                task, model_name)
    # TODO: get all recordings with the same task from all subjects
    rec_paths = get_rec_paths(dataset_dir, task, subject)
    assert len(rec_paths) > 0, 'no datasets for subject %s found!' % subject
    rec_names = []  # full rec_names including subject code
    if task == 'multi':
        # for 'multi', each rec_paths entry is a sequence; name after the
        # first path
        for rec_path in [rec_path[0] for rec_path in rec_paths]:
            rec_names.append(rec_name_from_path(rec_path))
    else:
        for rec_path in rec_paths:
            rec_names.append(rec_name_from_path(rec_path))
    print('found the following datasets:')
    for rec_path in rec_paths:
        print(rec_path)
    for rec_path, rec_name in zip(rec_paths, rec_names):
        print(f"Working on {rec_path}")
        rec_dir = osp.join(log_dir, rec_name)
        if not osp.exists(rec_dir):
            os.makedirs(rec_dir)
        # setup logging file handler
        setup_logging_handler(root, rec_dir)
        logger.info('Called with configs:')
        logger.info(json.dumps(cfg, indent=2))
        # export the resolved configs as JSON next to the run outputs
        with open(osp.join(rec_dir, 'configs.json'), 'w+') as fp:
            json.dump(cfg, fp, indent=2)
        msg = str('Working on dataset %s:' %
                  rec_name if task == 'multi' else rec_path)
        logger.info(msg + '\n' + '=' * len(msg))
        # cv/train always use the default dataset key; eval uses the
        # configured one
        if mode == 'cv' or mode == 'train':
            dataset_name = 'D'
        else:
            dataset_name = cfg.EVAL.DATASET
        # TODO: refactor and simplify
        if task == 'multi':
            trials, in_channels = read_multi_datasets(
                rec_path, dataset_name,
                mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
            # one output class per dataset in the multi-task case
            num_classes = len(rec_path)
        else:
            trials, in_channels = read_dataset(
                rec_path, dataset_name,
                mha_only=cfg.TRAINING.MHA_CHANNELS_ONLY)
            num_classes = 1
        # TODO: move the logs and check to the read_dataset
        logger.info(f'{len(trials)} trials found')
        logger.info(f'Number of input channels: {in_channels}')
        if in_channels < 1:
            # nothing to train/evaluate on -- abort the whole run
            logger.warning(f'Zero valid channels found!!!!!!')
            print(f'Zero valid channels found!!!!!!')
            return
        if mode == 'cv':
            run_cv(n_splits, task, rec_name, rec_dir, trials, in_channels,
                   num_classes)
        elif mode == 'train':
            run_training(task, rec_name, rec_dir, trials, in_channels,
                         num_classes)
        # eval
        else:
            run_eval(task, train_path, rec_name, rec_dir, trials,
                     in_channels, num_classes)
        print(f"Finished working on {rec_path}")
    logger.info('Done!')
    print("Done!")