Example #1
    def split_into_train_valid_test(self, dataset):
        # First split all sets individually, then merge them
        one_set_splitter = KaggleTrainValidTestSplitter(
            use_test_as_valid=self.use_test_as_valid)
        splitted_sets = [one_set_splitter.split_into_train_valid_test(s)
                         for s in dataset.kaggle_sets]
        train_topos = [s['train'].get_topological_view() for s in splitted_sets]
        train_ys = [s['train'].y for s in splitted_sets]
        # create dense design matrix sets
        train_set = DenseDesignMatrixWrapper(
            topo_view=np.concatenate(train_topos, axis=0),
            y=np.concatenate(train_ys, axis=0), axes=('b', 'c', 0, 1))

        valid_topos = [s['valid'].get_topological_view() for s in splitted_sets]
        valid_ys = [s['valid'].y for s in splitted_sets]
        valid_set = DenseDesignMatrixWrapper(
            topo_view=np.concatenate(valid_topos, axis=0),
            y=np.concatenate(valid_ys, axis=0), axes=('b', 'c', 0, 1))

        test_topos = [s['test'].get_topological_view() for s in splitted_sets]
        test_ys = [s['test'].y for s in splitted_sets]
        test_set = DenseDesignMatrixWrapper(
            topo_view=np.concatenate(test_topos, axis=0),
            y=np.concatenate(test_ys, axis=0), axes=('b', 'c', 0, 1))

        return OrderedDict([('train', train_set),
                            ('valid', valid_set),
                            ('test', test_set)])
Example #2
    def split_into_train_valid_test(self, dataset):
        # make into 4d tensors
        i_valid_series = 6
        if self.use_test_as_valid:
            i_valid_series = 7
        X_train = np.concatenate(dataset.train_X_series[:i_valid_series])[:, :, np.newaxis, np.newaxis]
        X_valid = dataset.train_X_series[i_valid_series][:, :, np.newaxis, np.newaxis]
        X_test = dataset.train_X_series[7][:, :, np.newaxis, np.newaxis]

        y_train = np.concatenate(dataset.train_y_series[:i_valid_series])
        y_valid = dataset.train_y_series[i_valid_series]
        y_test = dataset.train_y_series[7]

        # create dense design matrix sets
        train_set = DenseDesignMatrixWrapper(
            topo_view=X_train,
            y=y_train, axes=('b', 'c', 0, 1))
        valid_set = DenseDesignMatrixWrapper(
            topo_view=X_valid,
            y=y_valid, axes=('b', 'c', 0, 1))
        test_set = DenseDesignMatrixWrapper(
            topo_view=X_test,
            y=y_test, axes=('b', 'c', 0, 1))
        return OrderedDict([('train', train_set),
                            ('valid', valid_set),
                            ('test', test_set)])
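
A quick sketch of the reshaping used above (the array below is made up for illustration): appending two singleton axes turns a (samples, channels) array into the 4D layout expected with axes ('b', 'c', 0, 1).

import numpy as np

X = np.zeros((100, 32))  # e.g. 100 samples, 32 channels
X_4d = X[:, :, np.newaxis, np.newaxis]
assert X_4d.shape == (100, 32, 1, 1)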
Example #3
def split_set_by_indices(dataset, train_fold, valid_fold, test_fold):
    n_trials = dataset.get_topological_view().shape[0]
    # Make sure there are no overlaps and we have all possible trials
    # assigned
    assert np.intersect1d(valid_fold, test_fold).size == 0
    assert np.intersect1d(train_fold, test_fold).size == 0
    assert np.intersect1d(train_fold, valid_fold).size == 0
    assert (set(np.concatenate((train_fold, valid_fold, test_fold))) ==
            set(range(n_trials)))

    train_set = DenseDesignMatrixWrapper(
        topo_view=dataset.get_topological_view()[train_fold],
        y=dataset.y[train_fold],
        axes=dataset.view_converter.axes)
    valid_set = DenseDesignMatrixWrapper(
        topo_view=dataset.get_topological_view()[valid_fold],
        y=dataset.y[valid_fold],
        axes=dataset.view_converter.axes)
    test_set = DenseDesignMatrixWrapper(
        topo_view=dataset.get_topological_view()[test_fold],
        y=dataset.y[test_fold],
        axes=dataset.view_converter.axes)
    # make ordered dict to make it easier to iterate, i.e. for logging
    datasets = OrderedDict([('train', train_set), ('valid', valid_set),
                            ('test', test_set)])
    return datasets
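
A minimal usage sketch for split_set_by_indices, assuming numpy (np), OrderedDict, DenseDesignMatrixWrapper and the to_4d_array test helper are available as in the surrounding snippets; the fold indices are made up for illustration.

data = np.arange(10)
dataset = DenseDesignMatrixWrapper(topo_view=to_4d_array(data), y=np.zeros(10))
# trials 0-7 for training, trial 8 for validation, trial 9 for testing
folds = split_set_by_indices(dataset,
                             train_fold=np.arange(8),
                             valid_fold=np.array([8]),
                             test_fold=np.array([9]))
assert folds['train'].get_topological_view().shape[0] == 8
assert folds['valid'].get_topological_view().shape[0] == 1
assert folds['test'].get_topological_view().shape[0] == 1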
Example #4
    def split_into_train_valid_test(self, sets_container):
        # Otherwise, rewriting the code should not be too hard..
        assert len(sets_container.sets) > 1, "Expect at least 2 sets here..."
        # merge all sets before the last one into one...
        # the left-out, final set is the test set
        # if we have more than 2 sets, use the second-to-last set as valid set

        # remember python indexing is 0-based..
        i_test_set = len(sets_container.sets) - 1
        # in case of just 2 sets, the set indexed as valid will get split into
        # train and valid in the else clause later
        if self.use_test_as_valid:
            i_valid_set = i_test_set
        else:
            i_valid_set = i_test_set - 1
        if len(sets_container.sets) > 2 or self.use_test_as_valid:
            # can split off train and valid as individual sets
            topos = [s.get_topological_view() for s in
                     sets_container.sets[:i_valid_set]]
            ys = [s.y for s in sets_container.sets[:i_valid_set]]
            train_topo = np.concatenate(topos, axis=0)
            train_y = np.concatenate(ys, axis=0)
            train = DenseDesignMatrixWrapper(
                topo_view=train_topo, y=train_y,
                axes=sets_container.sets[0].view_converter.axes)
            valid = sets_container.sets[i_valid_set]
            test = sets_container.sets[i_test_set]
        else:
            n_folds = int(np.round(1.0 / self.valid_set_fraction))
            # have to split off valid from first=train set
            topos = [s.get_topological_view() for s in
                     sets_container.sets[:i_valid_set + 1]]
            ys = [s.y for s in sets_container.sets[:i_valid_set + 1]]
            full_topo = np.concatenate(topos, axis=0)
            full_y = np.concatenate(ys, axis=0)
            train_valid_set = DenseDesignMatrixWrapper(
                topo_view=full_topo, y=full_y,
                axes=sets_container.sets[0].view_converter.axes)
            train_valid_splitter = CntTrialSingleFoldSplitter(n_folds=n_folds,
                                                              i_test_fold=-1,
                                                              shuffle=False)
            # the test fold split off from training is the validation fold...
            train_splitted_sets = (
                train_valid_splitter.split_into_train_valid_test(
                    train_valid_set))
            train = concatenate_sets(train_splitted_sets['train'],
                                     train_splitted_sets['valid'])
            valid = train_splitted_sets['test']
            test = sets_container.sets[i_test_set]
        return OrderedDict(
            [('train', train), ('valid', valid), ('test', test)])
Example #5
def create_envelops_per_filterband(iterator, train_set, filterbands):
    env_per_filterband = []
    train_topo = train_set.get_topological_view()
    for low_cut_hz, high_cut_hz in filterbands:
        log.info("Compute filterband from {:.1f} to {:.1f}...".format(
            low_cut_hz, high_cut_hz))
        if low_cut_hz > 0 and high_cut_hz < 125:
            filtered = bandpass_topo(train_topo, low_cut_hz, high_cut_hz,
                                     sampling_rate=250.0, axis=0, filt_order=4)
        elif low_cut_hz == 0:
            filtered = lowpass_topo(train_topo, high_cut_hz,
                                    sampling_rate=250.0, axis=0, filt_order=4)
        else:
            raise ValueError("Expect frequencies between 0 and 125 Hz, got a "
                             "filterband from {:f} to {:f} Hz.".format(
                                 low_cut_hz, high_cut_hz))
        filtered = filtered.astype(np.float32)
        filt_set = DenseDesignMatrixWrapper(topo_view=filtered, y=train_set.y,
                                            axes=train_set.view_converter.axes)
        batches_topo = [b[0] for b in iterator.get_batches(filt_set, shuffle=False)]
        batches_topo = np.concatenate(batches_topo)
        log.info("Compute envelope...")
        env = np.abs(scipy.signal.hilbert(batches_topo, axis=2))
        env = env.astype(np.float32)
        env_per_filterband.append(env)
        
    log.info("Merge into one array...")
    env_per_filterband = np.array(env_per_filterband, dtype=np.float32)
    log.info("Done.")
    return env_per_filterband
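
A small, self-contained sketch of the envelope step used above: scipy.signal.hilbert returns the analytic signal, and its magnitude is the amplitude envelope. The toy signal below is made up for illustration.

import numpy as np
import scipy.signal

t = np.linspace(0, 1, 250, endpoint=False)
# 10 Hz carrier with a slowly varying amplitude
x = (1 + 0.5 * np.sin(2 * np.pi * 2 * t)) * np.sin(2 * np.pi * 10 * t)
envelope = np.abs(scipy.signal.hilbert(x))
assert envelope.shape == x.shape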
Example #6
def test_preprocessed_splitter():
    class DemeanPreproc():
        """Just for tests :)"""
        def apply(self, dataset, can_fit=False):
            topo_view = dataset.get_topological_view()
            if can_fit:
                self.mean = np.mean(topo_view)
            dataset.set_topological_view(topo_view - self.mean)

    data = np.arange(10)
    dataset = DenseDesignMatrixWrapper(topo_view=to_4d_array(data),
                                       y=np.zeros(10))
    splitter = SingleFoldSplitter(n_folds=10, i_test_fold=9)
    preproc_splitter = PreprocessedSplitter(dataset_splitter=splitter,
                                            preprocessor=DemeanPreproc())

    first_round_sets = preproc_splitter.get_train_valid_test(dataset)

    train_topo = first_round_sets['train'].get_topological_view()
    valid_topo = first_round_sets['valid'].get_topological_view()
    test_topo = first_round_sets['test'].get_topological_view()
    assert np.array_equal(
        train_topo, to_4d_array([-3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5]))
    assert np.array_equal(valid_topo, to_4d_array([4.5]))
    assert np.array_equal(test_topo, to_4d_array([5.5]))

    second_round_set = preproc_splitter.get_train_merged_valid_test(dataset)

    train_topo = second_round_set['train'].get_topological_view()
    valid_topo = second_round_set['valid'].get_topological_view()
    test_topo = second_round_set['test'].get_topological_view()
    assert np.array_equal(train_topo,
                          to_4d_array([-4, -3, -2, -1, 0, 1, 2, 3, 4]))
    assert np.array_equal(valid_topo, to_4d_array([4]))
    assert np.array_equal(test_topo, to_4d_array([5]))
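
Where the expected values above come from (a small arithmetic check, not part of the original test): in the first round the preprocessor fits its mean on the train trials 0..7 only, in the second round on the merged train+valid trials 0..8.

import numpy as np

assert np.mean(np.arange(8)) == 3.5  # first round: e.g. 9 - 3.5 = 5.5 for the test trial
assert np.mean(np.arange(9)) == 4.0  # second round: e.g. 9 - 4.0 = 5.0 for the test trial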
Example #7
def test_cnt_windows_iterator_oversample():
    iterator = CntWindowsFromCntIterator(batch_size=3,
                                         input_time_length=6,
                                         n_sample_preds=4,
                                         oversample_targets=True)
    iterator.reset_rng()
    y = np.append(np.zeros((16, 4)), np.ones((4, 4)), axis=0)
    in_set = DenseDesignMatrixWrapper(topo_view=to_4d_time_array(
        range(20)).swapaxes(2, 0),
                                      y=y)

    batches = list(iterator.get_batches(in_set, shuffle=True))
    assert 3 == len(batches)
    # Note that range(14, 20) is oversampled
    assert np.array_equal(
        batches[0][0].squeeze(), [range(
            6, 12), range(10, 16), range(14, 20)])
    assert np.array_equal(
        batches[1][0].squeeze(),
        [range(14, 20), range(14, 20),
         range(14, 20)])
    assert np.array_equal(
        batches[2][0].squeeze(),
        [[18, 19] + range(4), range(14, 20),
         range(2, 8)])
Example #8
def split_into_two_sets(full_set, last_set_fraction):
    topo = full_set.get_topological_view()
    y = full_set.y
    n_trials = len(full_set.y)
    
    i_last_first_set_trial = n_trials - 1 - int(last_set_fraction * n_trials)
    
    first_set = DenseDesignMatrixWrapper(
        topo_view=topo[0:i_last_first_set_trial+1], 
        y=y[0:i_last_first_set_trial+1], 
        axes=full_set.view_converter.axes)
    second_set = DenseDesignMatrixWrapper(
        topo_view=topo[i_last_first_set_trial+1:], 
        y=y[i_last_first_set_trial+1:], 
        axes=full_set.view_converter.axes)
    
    return first_set, second_set
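
A minimal sketch of the index arithmetic in split_into_two_sets, assuming the same imports and helpers (np, DenseDesignMatrixWrapper, to_4d_array) as in the surrounding snippets.

dataset = DenseDesignMatrixWrapper(topo_view=to_4d_array(np.arange(10)),
                                   y=np.zeros(10))
first_set, second_set = split_into_two_sets(dataset, last_set_fraction=0.2)
# i_last_first_set_trial = 10 - 1 - int(0.2 * 10) = 7, so the first set keeps
# trials 0..7 and the second set keeps trials 8..9
assert first_set.get_topological_view().shape[0] == 8
assert second_set.get_topological_view().shape[0] == 2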
Example #9
def test_fixed_trial_with_rounding():
    dataset = DenseDesignMatrixWrapper(topo_view=to_4d_array(range(12)),
                                       y=np.zeros(12))
    splitter = FixedTrialSplitter(n_train_trials=9, valid_set_fraction=0.2)
    sets = splitter.split_into_train_valid_test(dataset)
    assert np.array_equal(sets['train'].get_topological_view().squeeze(),
                          range(8))
    assert sets['valid'].get_topological_view().squeeze() == 8
    assert np.array_equal(sets['test'].get_topological_view().squeeze(),
                          range(9, 12))
Example #10
def split_sets_after_preproc_merge(full_set, split_index, split_to_end_num):
    """Assumes that full set may be doubled or tripled in size
    and split index refers to original size. So
    if we originally had 100 trials (set1) + 20 trials (set2)
    merged to 120 trials, we get a split index of 100.
    If we later have 360 trials we assume that the 360 trials
    consist of:
    100 trials set1 + 20 trials set2 + 100 trials set1 + 20 trials set2
    + 100 trials set1 + 20 trials set2
    (and not 300 trials set1 + 60 trials set2)
    """
    full_topo = full_set.get_topological_view()
    full_y = full_set.y
    original_full_len = split_index + split_to_end_num
    topo_first = full_topo[:split_index]
    y_first = full_y[:split_index]
    topo_second = full_topo[split_index:original_full_len]
    y_second = full_y[split_index:original_full_len]
    next_start = original_full_len
    # Go through possibly appended transformed copies of dataset
    # If preprocessors did not change dataset size, this is not 
    # necessary
    for next_split in xrange(next_start + split_index, 
            len(full_set.y), original_full_len):
        assert False, "Please check/test this code again if you need it"
        next_end = next_split + split_to_end_num
        topo_first = np.concatenate((topo_first, 
            full_topo[next_start:next_split]))
        y_first = np.concatenate((y_first, full_y[next_start:next_split]))
        topo_second = np.concatenate((topo_second, 
            full_topo[next_split:next_end]))
        y_second =  np.concatenate((y_second, full_y[next_split:next_end]))
        next_start = next_end
    first_set = DenseDesignMatrixWrapper(
        topo_view=topo_first,
        y=y_first,
        axes=full_set.view_converter.axes)
    second_set = DenseDesignMatrixWrapper(
        topo_view=topo_second,
        y=y_second,
        axes=full_set.view_converter.axes)
    return first_set, second_set
Example #11
def test_first_fold():
    data = np.arange(10)
    dataset = DenseDesignMatrixWrapper(topo_view=to_4d_array(data),
                                       y=np.zeros(10))
    splitter = SingleFoldSplitter(n_folds=10, i_test_fold=0)
    datasets = splitter.split_into_train_valid_test(dataset)

    assert np.array_equal(to_4d_array(np.arange(1, 9)),
                          datasets['train'].get_topological_view())
    assert np.array_equal(to_4d_array([9]),
                          datasets['valid'].get_topological_view())
    assert np.array_equal(to_4d_array([0]),
                          datasets['test'].get_topological_view())
Example #12
def test_repeated_calls_with_shuffle():
    """Repeated calls should always lead to same split"""
    data = np.arange(100)
    dataset = DenseDesignMatrixWrapper(topo_view=to_4d_array(data),
                                       y=np.zeros(100))
    splitter = SingleFoldSplitter(n_folds=10, i_test_fold=9, shuffle=True)
    reference_datasets = splitter.split_into_train_valid_test(dataset)

    # 20 attempts at splitting should all lead to the same datasets!
    for _ in range(20):
        new_datasets = splitter.split_into_train_valid_test(dataset)
        for key in reference_datasets:
            assert np.array_equal(
                reference_datasets[key].get_topological_view(),
                new_datasets[key].get_topological_view())
Example #13
def concatenate_sets(first_set, second_set):
    """Concatenates topo views and y(targets)
    """
    # hackily disabled for epilepsy competition
    #assert first_set.view_converter.axes == second_set.view_converter.axes,\
    #    "first set and second set should have same axes ordering"
    #assert first_set.view_converter.axes[0] == 'b', ("Expect batch axis "
    #    "as first axis")
    merged_topo_view = np.concatenate((first_set.get_topological_view(),
        second_set.get_topological_view()), axis=0)
    merged_y = np.concatenate((first_set.y, second_set.y), axis=0) 
    merged_set = DenseDesignMatrixWrapper(
        topo_view=merged_topo_view,
        y=merged_y,
        axes=first_set.view_converter.axes)
    return merged_set
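
A minimal usage sketch for concatenate_sets, assuming the same imports and helpers (np, DenseDesignMatrixWrapper, to_4d_array) as in the surrounding snippets; two tiny sets are merged along the batch axis, so trial counts simply add up.

set_a = DenseDesignMatrixWrapper(topo_view=to_4d_array(np.arange(3)),
                                 y=np.zeros(3))
set_b = DenseDesignMatrixWrapper(topo_view=to_4d_array(np.arange(3, 5)),
                                 y=np.ones(2))
merged = concatenate_sets(set_a, set_b)
assert merged.get_topological_view().shape[0] == 5
assert len(merged.y) == 5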
Example #14
def test_cnt_windows_iterator_shuffle():
    # Random regression test; values should not change unless the randomization changes...
    iterator = CntWindowsFromCntIterator(batch_size=2,
                                         input_time_length=6,
                                         n_sample_preds=4)
    iterator.reset_rng()

    in_topo = to_4d_time_array(range(20)).swapaxes(2, 0)
    y = np.outer(range(20), np.ones(4))
    in_set = DenseDesignMatrixWrapper(topo_view=in_topo, y=y)

    batches = list(iterator.get_batches(in_set, shuffle=True))
    assert 2 == len(batches)
    assert np.array_equal([range(6, 12), [18, 19] + range(4),
                           range(10, 16)], batches[0][0].squeeze())
    assert np.array_equal(
        np.outer([8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15], np.ones(4)),
        batches[0][1])
    assert np.array_equal([range(14, 20), range(2, 8)],
                          batches[1][0].squeeze())
    assert np.array_equal(np.outer([16, 17, 18, 19, 4, 5, 6, 7], np.ones(4)),
                          batches[1][1])
Example #15
def test_cnt_windows_iterator():
    iterator = CntWindowsFromCntIterator(batch_size=2,
                                         input_time_length=6,
                                         n_sample_preds=4)

    in_topo = to_4d_time_array(range(20)).swapaxes(2, 0)
    y = np.outer(range(20), np.ones(4))
    in_set = DenseDesignMatrixWrapper(topo_view=in_topo, y=y)

    batches = list(iterator.get_batches(in_set, shuffle=False))

    assert 2 == len(batches)
    # we have two lost samples so expect wraparound from back
    assert np.array_equal(
        [[18, 19] + range(4), range(2, 8),
         range(6, 12)], batches[0][0].squeeze())
    assert np.array_equal(
        np.outer([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], np.ones(4)),
        batches[0][1])
    assert np.array_equal([range(10, 16), range(14, 20)],
                          batches[1][0].squeeze())
    assert np.array_equal(
        np.outer([12, 13, 14, 15, 16, 17, 18, 19], np.ones(4)), batches[1][1])
Example #16
def test_windows_iterator():
    topo_data = [range(i_trial, i_trial + 6) for i_trial in range(3)]
    topo_data = np.array(topo_data)[:, np.newaxis, :, np.newaxis]

    y = np.int32(range(topo_data.shape[0]))
    dataset = DenseDesignMatrixWrapper(topo_view=topo_data,
                                       y=y,
                                       axes=('b', 'c', 0, 1))

    iterator = WindowsIterator(batch_size=7,
                               n_samples_per_window=2,
                               sample_axes_name=0,
                               n_sample_stride=1)

    batches = list(iterator.get_batches(dataset, shuffle=False))
    assert (len(batches) == 2)
    assert np.array_equal((8, 1, 2, 1), np.array(batches[0][0]).shape)
    assert np.array_equal([0, 1], batches[0][0][0].squeeze())
    assert np.array_equal([1, 2], batches[0][0][1].squeeze())
    assert np.array_equal([2, 3], batches[0][0][2].squeeze())
    assert np.array_equal([3, 4], batches[0][0][3].squeeze())
    assert np.array_equal([4, 5], batches[0][0][4].squeeze())
    assert np.array_equal([1, 2], batches[0][0][5].squeeze())
    assert np.array_equal([2, 3], batches[0][0][6].squeeze())
    assert np.array_equal([3, 4], batches[0][0][7].squeeze())

    assert np.array_equal([1, 2], batches[0][0][1].squeeze())

    assert np.array_equal((7, 1, 2, 1), np.array(batches[1][0]).shape)

    assert np.array_equal([4, 5], batches[1][0][0].squeeze())
    assert np.array_equal([5, 6], batches[1][0][1].squeeze())
    assert np.array_equal([2, 3], batches[1][0][2].squeeze())
    assert np.array_equal([3, 4], batches[1][0][3].squeeze())
    assert np.array_equal([4, 5], batches[1][0][4].squeeze())
    assert np.array_equal([5, 6], batches[1][0][5].squeeze())
    assert np.array_equal([6, 7], batches[1][0][6].squeeze())
Example #17
def create_submission_csv_for_one_subject(folder_name, kaggle_set, iterator, preprocessor,
        final_layer, submission_id):
    ### Load and preprocess data
    kaggle_set.load()
    # remember test series lengths before and after resampling to more accurately pad predictions
    # later (padding due to the lost samples)
    kaggle_set.load_test_data()
    test_series_lengths = [len(series) for series in kaggle_set.test_X_series] 
    kaggle_set.resample_test_data()
    test_series_lengths_resampled = [len(series) for series in kaggle_set.test_X_series] 
    X_train = deepcopy(np.concatenate(kaggle_set.train_X_series)[:,:,np.newaxis,np.newaxis])
    X_test_0 = deepcopy(kaggle_set.test_X_series[0][:,:,np.newaxis,np.newaxis])
    X_test_1 = deepcopy(kaggle_set.test_X_series[1][:,:,np.newaxis,np.newaxis])

    # create dense design matrix sets
    train_set = DenseDesignMatrixWrapper(
        topo_view=X_train, 
        y=None, axes=('b','c',0,1))
    fake_test_y = np.ones((len(X_test_0), 6))
    test_set_0 = DenseDesignMatrixWrapper(
        topo_view=X_test_0, 
        y=fake_test_y)
    fake_test_y = np.ones((len(X_test_1), 6))
    test_set_1 = DenseDesignMatrixWrapper(
        topo_view=X_test_1, 
        y=fake_test_y)
    log.info("Preprocessing data...")
    preprocessor.apply(train_set, can_fit=True)
    preprocessor.apply(test_set_0, can_fit=False)
    preprocessor.apply(test_set_1, can_fit=False)
    
    ### Create prediction function and create predictions
    log.info("Create prediction functions...")
    input_var = lasagne.layers.get_all_layers(final_layer)[0].input_var
    predictions = lasagne.layers.get_output(final_layer, deterministic=True)
    pred_fn = theano.function([input_var], predictions)
    log.info("Make predictions...")
    batch_gen_0 = iterator.get_batches(test_set_0, shuffle=False)
    all_preds_0 = [pred_fn(batch[0]) for batch in batch_gen_0]
    batch_gen_1 = iterator.get_batches(test_set_1, shuffle=False)
    all_preds_1 = [pred_fn(batch[0]) for batch in batch_gen_1]
    
    ### Pad and reshape predictions
    n_sample_preds = get_n_sample_preds(final_layer)
    input_time_length = lasagne.layers.get_all_layers(final_layer)[0].shape[2]
    
    n_samples_0 = test_set_0.get_topological_view().shape[0]
    preds_arr_0 = get_reshaped_cnt_preds(all_preds_0, n_samples_0, 
        input_time_length, n_sample_preds)
    n_samples_1 = test_set_1.get_topological_view().shape[0]
    preds_arr_1 = get_reshaped_cnt_preds(all_preds_1, n_samples_1, 
        input_time_length, n_sample_preds)

    series_preds = [preds_arr_0, preds_arr_1]
    assert len(series_preds[0]) == test_series_lengths_resampled[0]
    assert len(series_preds[1]) == test_series_lengths_resampled[1]
    assert False, ("TODO: here only duplicate if resample half is true for the dataset.. "
        "also take care how to create submission cv if trained on all subjects")
    series_preds_duplicated = [np.repeat(preds, 2,axis=0) for preds in series_preds]
    n_classes = preds_arr_0.shape[1]
    # pad missing ones with zeros
    missing_0 = test_series_lengths[0] - len(series_preds_duplicated[0])
    full_preds_0 = np.append(np.zeros((missing_0, n_classes), dtype=np.float32), 
                             series_preds_duplicated[0], axis=0)
    missing_1 = test_series_lengths[1] - len(series_preds_duplicated[1])
    full_preds_1 = np.append(np.zeros((missing_1, n_classes), dtype=np.float32),
                             series_preds_duplicated[1], axis=0)
    assert len(full_preds_0) == test_series_lengths[0]
    assert len(full_preds_1) == test_series_lengths[1]

    full_series_preds = [full_preds_0, full_preds_1]
    assert sum([len(a) for a in full_series_preds]) == np.sum(test_series_lengths)
    
    ### Create csv 

    log.info("Create csv...")
    csv_filename =  "{:02d}".format(submission_id) + '.csv'
    csv_filename = os.path.join(folder_name, csv_filename)
    cols = ['HandStart','FirstDigitTouch',
        'BothStartLoadPhase','LiftOff',
        'Replace','BothReleased']

    # collect ids
    all_ids = []
    all_preds = []
    for i_series in (9,10):
        id_prefix = "subj{:d}_series{:d}_".format(kaggle_set.i_subject, i_series)
        this_preds = full_series_preds[i_series-9] # respect offsets
        all_preds.extend(this_preds)
        this_ids = [id_prefix + str(i_sample) for i_sample in range(this_preds.shape[0])]
        all_ids.extend(this_ids)
    all_ids = np.array(all_ids)
    all_preds = np.array(all_preds)
    submission = pd.DataFrame(index=all_ids,
                              columns=cols,
                              data=all_preds)

    submission.to_csv(csv_filename, index_label='id',float_format='%.3f')
    log.info("Done")
Example #18
def create_submission_csv_for_all_subject_model(folder_name,
                                                all_sub_kaggle_set,
                                                dataset_provider, iterator,
                                                final_layer, submission_id):
    all_sub_kaggle_set.load()
    assert all_sub_kaggle_set.resample_half == False, ("Not implemented for "
                                                       "resample half")
    all_sub_kaggle_set.load_test()
    # the following line already runs the preprocessing on the train set...
    dataset_provider.get_train_merged_valid_test(all_sub_kaggle_set)
    test_sets_per_subj = []
    for i_subject in range(12):
        kaggle_set = all_sub_kaggle_set.kaggle_sets[i_subject]
        this_sets = []
        for i_test_series in range(2):
            # Get input
            X_test = kaggle_set.test_X_series[i_test_series][:, :, np.newaxis,
                                                             np.newaxis]
            fake_test_y = np.ones((len(X_test), 6))
            test_set = DenseDesignMatrixWrapper(topo_view=X_test,
                                                y=fake_test_y)
            if dataset_provider.preprocessor is not None:
                dataset_provider.preprocessor.apply(test_set, can_fit=False)
            this_sets.append(test_set)
        assert len(this_sets) == 2
        test_sets_per_subj.append(this_sets)

    ### Create prediction function and create predictions
    log.info("Create prediction functions...")
    input_var = lasagne.layers.get_all_layers(final_layer)[0].input_var
    predictions = lasagne.layers.get_output(final_layer, deterministic=True)
    pred_fn = theano.function([input_var], predictions)
    log.info("Setup iterator...")
    n_sample_preds = get_n_sample_preds(final_layer)
    iterator.n_sample_preds = n_sample_preds
    log.info("Make predictions...")
    preds_per_subject = []
    for i_subject in range(12):
        log.info("Predictions for Subject {:d}...".format(i_subject + 1))
        test_sets_subj = test_sets_per_subj[i_subject]
        preds = get_y_for_subject(pred_fn, test_sets_subj[0],
                                  test_sets_subj[1], iterator, final_layer)
        preds_per_subject.append(preds)
    log.info("Done")
    log.info("Create csv...")
    cols = [
        'HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff',
        'Replace', 'BothReleased'
    ]
    # collect ids
    all_ids = []
    all_preds = []
    for i_subject in range(12):
        pred_subj_per_series = preds_per_subject[i_subject]
        for i_series in (9, 10):
            id_prefix = "subj{:d}_series{:d}_".format(i_subject + 1, i_series)
            this_preds = pred_subj_per_series[i_series - 9]  # respect offsets
            all_preds.extend(this_preds)
            this_ids = [
                id_prefix + str(i_sample)
                for i_sample in range(this_preds.shape[0])
            ]
            all_ids.extend(this_ids)

    all_ids = np.array(all_ids)
    all_preds = np.array(all_preds)
    assert all_ids.shape == (3144171, )
    assert all_preds.shape == (3144171, 6)
    submission = pd.DataFrame(index=all_ids, columns=cols, data=all_preds)

    csv_output = StringIO.StringIO()
    submission.to_csv(csv_output, index_label='id', float_format='%.3f')
    csv_str = csv_output.getvalue()

    log.info("Create zip...")
    zip_file_name = os.path.join(folder_name, "{:d}.zip".format(submission_id))
    submission_zip_file = ZipFile(zip_file_name, 'w', ZIP_DEFLATED)
    submission_zip_file.writestr("submission.csv", csv_str)
    submission_zip_file.close()
    log.info("Done")
Example #19
def create_submission_csv_for_one_subject(folder_name, kaggle_set, iterator,
                                          preprocessor, final_layer,
                                          submission_id):
    ### Load and preprocess data
    kaggle_set.load()
    # remember test series lengths before and after resampling to more accurately pad predictions
    # later (padding due to the lost samples)
    kaggle_set.load_test_data()
    test_series_lengths = [len(series) for series in kaggle_set.test_X_series]
    kaggle_set.resample_test_data()
    test_series_lengths_resampled = [
        len(series) for series in kaggle_set.test_X_series
    ]
    X_train = deepcopy(
        np.concatenate(kaggle_set.train_X_series)[:, :, np.newaxis,
                                                  np.newaxis])
    X_test_0 = deepcopy(kaggle_set.test_X_series[0][:, :, np.newaxis,
                                                    np.newaxis])
    X_test_1 = deepcopy(kaggle_set.test_X_series[1][:, :, np.newaxis,
                                                    np.newaxis])

    # create dense design matrix sets
    train_set = DenseDesignMatrixWrapper(topo_view=X_train,
                                         y=None,
                                         axes=('b', 'c', 0, 1))
    fake_test_y = np.ones((len(X_test_0), 6))
    test_set_0 = DenseDesignMatrixWrapper(topo_view=X_test_0, y=fake_test_y)
    fake_test_y = np.ones((len(X_test_1), 6))
    test_set_1 = DenseDesignMatrixWrapper(topo_view=X_test_1, y=fake_test_y)
    log.info("Preprocessing data...")
    preprocessor.apply(train_set, can_fit=True)
    preprocessor.apply(test_set_0, can_fit=False)
    preprocessor.apply(test_set_1, can_fit=False)

    ### Create prediction function and create predictions
    log.info("Create prediction functions...")
    input_var = lasagne.layers.get_all_layers(final_layer)[0].input_var
    predictions = lasagne.layers.get_output(final_layer, deterministic=True)
    pred_fn = theano.function([input_var], predictions)
    log.info("Make predictions...")
    batch_gen_0 = iterator.get_batches(test_set_0, shuffle=False)
    all_preds_0 = [pred_fn(batch[0]) for batch in batch_gen_0]
    batch_gen_1 = iterator.get_batches(test_set_1, shuffle=False)
    all_preds_1 = [pred_fn(batch[0]) for batch in batch_gen_1]

    ### Pad and reshape predictions
    n_sample_preds = get_n_sample_preds(final_layer)
    input_time_length = lasagne.layers.get_all_layers(final_layer)[0].shape[2]

    n_samples_0 = test_set_0.get_topological_view().shape[0]
    preds_arr_0 = get_reshaped_cnt_preds(all_preds_0, n_samples_0,
                                         input_time_length, n_sample_preds)
    n_samples_1 = test_set_1.get_topological_view().shape[0]
    preds_arr_1 = get_reshaped_cnt_preds(all_preds_1, n_samples_1,
                                         input_time_length, n_sample_preds)

    series_preds = [preds_arr_0, preds_arr_1]
    assert len(series_preds[0]) == test_series_lengths_resampled[0]
    assert len(series_preds[1]) == test_series_lengths_resampled[1]
    assert False, (
        "TODO: here only duplicate if resample half is true for the dataset.. "
        "also take care how to create submission cv if trained on all subjects"
    )
    series_preds_duplicated = [
        np.repeat(preds, 2, axis=0) for preds in series_preds
    ]
    n_classes = preds_arr_0.shape[1]
    # pad missing ones with zeros
    missing_0 = test_series_lengths[0] - len(series_preds_duplicated[0])
    full_preds_0 = np.append(np.zeros((missing_0, n_classes),
                                      dtype=np.float32),
                             series_preds_duplicated[0],
                             axis=0)
    missing_1 = test_series_lengths[1] - len(series_preds_duplicated[1])
    full_preds_1 = np.append(np.zeros((missing_1, n_classes),
                                      dtype=np.float32),
                             series_preds_duplicated[1],
                             axis=0)
    assert len(full_preds_0) == test_series_lengths[0]
    assert len(full_preds_1) == test_series_lengths[1]

    full_series_preds = [full_preds_0, full_preds_1]
    assert sum([len(a)
                for a in full_series_preds]) == np.sum(test_series_lengths)

    ### Create csv

    log.info("Create csv...")
    csv_filename = "{:02d}".format(submission_id) + '.csv'
    csv_filename = os.path.join(folder_name, csv_filename)
    cols = [
        'HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff',
        'Replace', 'BothReleased'
    ]

    # collect ids
    all_ids = []
    all_preds = []
    for i_series in (9, 10):
        id_prefix = "subj{:d}_series{:d}_".format(kaggle_set.i_subject,
                                                  i_series)
        this_preds = full_series_preds[i_series - 9]  # respect offsets
        all_preds.extend(this_preds)
        this_ids = [
            id_prefix + str(i_sample)
            for i_sample in range(this_preds.shape[0])
        ]
        all_ids.extend(this_ids)
    all_ids = np.array(all_ids)
    all_preds = np.array(all_preds)
    submission = pd.DataFrame(index=all_ids, columns=cols, data=all_preds)

    submission.to_csv(csv_filename, index_label='id', float_format='%.3f')
    log.info("Done")
Example #20
def test_experiment_sample_windows():
    data_rng = RandomState(398765905)
    rand_topo = data_rng.rand(200, 10, 10, 3).astype(np.float32)
    rand_y = np.int32(data_rng.rand(200) > 0.5)
    rand_topo[rand_y == 1] += 0.1
    rand_set = DenseDesignMatrixWrapper(topo_view=rand_topo, y=rand_y)

    lasagne.random.set_rng(RandomState(9859295))
    in_layer = InputLayer(shape=[None, 10, 5, 3])
    network = DenseLayer(incoming=in_layer,
                         name='softmax',
                         num_units=2,
                         nonlinearity=lasagne.nonlinearities.softmax)
    updates_modifier = MaxNormConstraint({'softmax': 0.5})

    dataset = rand_set

    dataset_iterator = WindowsIterator(n_samples_per_window=5, batch_size=60)

    preprocessor = OnlineAxiswiseStandardize(axis=['c', 1])
    dataset_splitter = FixedTrialSplitter(n_train_trials=150,
                                          valid_set_fraction=0.1)
    updates_var_func = lasagne.updates.adam
    loss_var_func = lasagne.objectives.categorical_crossentropy
    monitors = [
        braindecode.veganlasagne.monitors.LossMonitor(),
        braindecode.veganlasagne.monitors.WindowMisclassMonitor(),
        braindecode.veganlasagne.monitors.RuntimeMonitor()
    ]
    stop_criterion = braindecode.veganlasagne.stopping.MaxEpochs(num_epochs=5)

    exp = Experiment(network,
                     dataset,
                     dataset_splitter,
                     preprocessor,
                     dataset_iterator,
                     loss_var_func,
                     updates_var_func,
                     updates_modifier,
                     monitors,
                     stop_criterion,
                     remember_best_chan='valid_misclass',
                     run_after_early_stop=True)
    exp.setup()
    exp.run()

    assert np.allclose(
        [0.629630, 0.140741, 0.029630, 0.022222, 0.000000, 0.000000, 0.000000],
        exp.monitor_chans['train_misclass'],
        rtol=1e-4,
        atol=1e-4)
    assert np.allclose(
        [0.400000, 0.133333, 0.066667, 0.000000, 0.000000, 0.000000, 0.000000],
        exp.monitor_chans['valid_misclass'],
        rtol=1e-4,
        atol=1e-4)
    assert np.allclose(
        [0.560000, 0.060000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000],
        exp.monitor_chans['test_misclass'],
        rtol=1e-4,
        atol=1e-4)
    assert np.allclose(
        [1.180485, 0.574264, 0.420023, 0.330909, 0.278569, 0.245692, 0.242845],
        exp.monitor_chans['train_loss'],
        rtol=1e-4,
        atol=1e-4)
    assert np.allclose(
        [1.016782, 0.514049, 0.370485, 0.288948, 0.240913, 0.211189, 0.215967],
        exp.monitor_chans['valid_loss'],
        rtol=1e-4,
        atol=1e-4)
    assert np.allclose(
        [1.031832, 0.504570, 0.352317, 0.269810, 0.223904, 0.196681, 0.197899],
        exp.monitor_chans['test_loss'],
        rtol=1e-4,
        atol=1e-4)
Example #21
def test_experiment_fixed_split():
    """ Regression test, checking that values have not changed from original run"""
    data_rng = RandomState(398765905)
    rand_topo = data_rng.rand(200, 10, 10, 3).astype(np.float32)
    rand_y = np.int32(data_rng.rand(200) > 0.5)
    rand_topo[rand_y == 1] += 0.01
    rand_set = DenseDesignMatrixWrapper(topo_view=rand_topo, y=rand_y)

    lasagne.random.set_rng(RandomState(9859295))
    in_layer = InputLayer(shape=[None, 10, 10, 3])
    network = DenseLayer(incoming=in_layer,
                         name="softmax",
                         num_units=2,
                         nonlinearity=lasagne.nonlinearities.softmax)

    updates_modifier = MaxNormConstraint({'softmax': 0.5})
    dataset = rand_set

    dataset_iterator = BalancedBatchIterator(batch_size=60)

    preprocessor = OnlineAxiswiseStandardize(axis=['c', 1])
    dataset_splitter = FixedTrialSplitter(n_train_trials=150,
                                          valid_set_fraction=0.1)
    updates_var_func = lasagne.updates.adam
    loss_var_func = lasagne.objectives.categorical_crossentropy
    monitors = [
        braindecode.veganlasagne.monitors.LossMonitor(),
        braindecode.veganlasagne.monitors.MisclassMonitor(),
        braindecode.veganlasagne.monitors.RuntimeMonitor()
    ]
    stop_criterion = braindecode.veganlasagne.stopping.MaxEpochs(num_epochs=30)

    exp = Experiment(network,
                     dataset,
                     dataset_splitter,
                     preprocessor,
                     dataset_iterator,
                     loss_var_func,
                     updates_var_func,
                     updates_modifier,
                     monitors,
                     stop_criterion,
                     remember_best_chan='valid_misclass',
                     run_after_early_stop=True)
    exp.setup()
    exp.run()
    assert np.allclose([
        0.548148, 0.540741, 0.503704, 0.451852, 0.392593, 0.370370, 0.340741,
        0.281481, 0.237037, 0.207407, 0.192593, 0.177778, 0.133333, 0.111111,
        0.111111, 0.103704, 0.096296, 0.088889, 0.088889, 0.081481, 0.074074,
        0.066667, 0.066667, 0.059259, 0.059259, 0.051852, 0.037037, 0.037037,
        0.029630, 0.029630, 0.029630, 0.053333, 0.053333, 0.053333, 0.053333,
        0.040000, 0.040000, 0.026667, 0.026667, 0.026667, 0.026667, 0.033333,
        0.033333, 0.033333, 0.033333, 0.026667, 0.020000, 0.020000, 0.020000
    ],
                       exp.monitor_chans['train_misclass'],
                       rtol=1e-4,
                       atol=1e-4)
    assert np.allclose([
        0.400000, 0.400000, 0.400000, 0.400000, 0.400000, 0.400000, 0.400000,
        0.400000, 0.333333, 0.333333, 0.333333, 0.266667, 0.266667, 0.266667,
        0.266667, 0.266667, 0.266667, 0.266667, 0.266667, 0.266667, 0.266667,
        0.266667, 0.266667, 0.333333, 0.333333, 0.333333, 0.333333, 0.266667,
        0.266667, 0.266667, 0.266667, 0.266667, 0.266667, 0.266667, 0.266667,
        0.200000, 0.200000, 0.133333, 0.133333, 0.133333, 0.133333, 0.133333,
        0.133333, 0.133333, 0.133333, 0.066667, 0.000000, 0.000000, 0.000000
    ],
                       exp.monitor_chans['valid_misclass'],
                       rtol=1e-4,
                       atol=1e-4)
    assert np.allclose([
        0.460000, 0.420000, 0.420000, 0.420000, 0.420000, 0.440000, 0.420000,
        0.420000, 0.400000, 0.400000, 0.380000, 0.400000, 0.400000, 0.400000,
        0.400000, 0.400000, 0.420000, 0.420000, 0.420000, 0.400000, 0.400000,
        0.400000, 0.380000, 0.380000, 0.380000, 0.380000, 0.400000, 0.400000,
        0.420000, 0.420000, 0.420000, 0.420000, 0.420000, 0.420000, 0.420000,
        0.420000, 0.400000, 0.400000, 0.380000, 0.400000, 0.400000, 0.400000,
        0.400000, 0.400000, 0.360000, 0.360000, 0.380000, 0.380000, 0.380000
    ],
                       exp.monitor_chans['test_misclass'],
                       rtol=1e-4,
                       atol=1e-4)
    assert np.allclose([
        1.200389, 0.777420, 0.740212, 0.705151, 0.672329, 0.641764, 0.613245,
        0.586423, 0.561397, 0.538399, 0.517073, 0.497741, 0.479949, 0.463601,
        0.448505, 0.434583, 0.421652, 0.409739, 0.398721, 0.388490, 0.378988,
        0.370121, 0.361965, 0.354295, 0.347159, 0.340496, 0.334237, 0.328328,
        0.322803, 0.317624, 0.312765, 0.340091, 0.335658, 0.330868, 0.325923,
        0.320895, 0.316027, 0.311290, 0.306683, 0.302364, 0.298264, 0.294475,
        0.290957, 0.287673, 0.284664, 0.281860, 0.279309, 0.276918, 0.274709
    ],
                       exp.monitor_chans['train_loss'],
                       rtol=1e-4,
                       atol=1e-4)
    assert np.allclose([
        0.766092, 0.642237, 0.636960, 0.629884, 0.623676, 0.618789, 0.613821,
        0.609264, 0.605430, 0.601499, 0.598178, 0.594579, 0.591720, 0.589461,
        0.587571, 0.585673, 0.583782, 0.581606, 0.580687, 0.579677, 0.579276,
        0.578903, 0.578918, 0.578901, 0.579020, 0.579575, 0.580291, 0.581120,
        0.581591, 0.582552, 0.583647, 0.585879, 0.582269, 0.571548, 0.555956,
        0.536982, 0.517474, 0.496652, 0.474400, 0.453094, 0.432208, 0.412533,
        0.394271, 0.377036, 0.361311, 0.346461, 0.333406, 0.321266, 0.310158
    ],
                       exp.monitor_chans['valid_loss'],
                       rtol=1e-4,
                       atol=1e-4)
    assert np.allclose([
        1.069603, 0.751982, 0.746711, 0.742126, 0.738055, 0.734703, 0.731921,
        0.729251, 0.727241, 0.724931, 0.723189, 0.721885, 0.720605, 0.719565,
        0.718930, 0.718664, 0.718671, 0.718747, 0.719004, 0.718935, 0.719153,
        0.719381, 0.719815, 0.720419, 0.721205, 0.721993, 0.722759, 0.723534,
        0.724298, 0.724908, 0.725497, 0.725097, 0.725950, 0.726615, 0.726953,
        0.727603, 0.728247, 0.728787, 0.729323, 0.729945, 0.730434, 0.731245,
        0.732168, 0.732949, 0.734086, 0.735250, 0.736381, 0.737502, 0.738444
    ],
                       exp.monitor_chans['test_loss'],
                       rtol=1e-4,
                       atol=1e-4)