Example #1
def _test_mcasey20120222():  # pragma: no cover
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    mapper = ChainMapper([mean_group_sample(['targets','chunks']),
                          SVDMapper()])
    clf = MappedClassifier(LinearCSVMC(), mapper)
    cvte = CrossValidation(clf, NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])

    ds = dataset_wizard(
        samples=np.arange(32).reshape((8, -1)),
        targets=[1, 1, 2, 2, 1, 1, 2, 2],
        chunks=[1, 1, 1, 1, 2, 2, 2, 2])

    errors = cvte(ds)
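    # With the default post-processing, `errors` should be a Dataset holding
    # one mis-classification error value per cross-validation fold.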
Example #2
def main(subject, study_dir, mask, suffix='_stim2'):
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.fx import mean_group_sample
    from wikisim import mvpa

    # load subject data
    sp = su.SubjPath(subject, study_dir)
    vols = task.prex_vols(sp.path('behav', 'log'))

    # load fmri data
    ds = mvpa.load_prex_beta(sp, suffix, mask, verbose=1)

    # zscore
    ds.sa['run'] = vols.run.values
    zscore(ds, chunks_attr='run')

    # average over item presentations
    ds.sa['itemno'] = vols.itemno.to_numpy()
    m = mean_group_sample(['itemno'])
    dsm = ds.get_mapped(m)
    m_vols = vols.groupby('itemno', as_index=False).mean()

    # save data samples and corresponding volume information
    res_dir = os.path.join(sp.study_dir, 'batch', 'glm', 'prex' + suffix,
                           'roi', mask)
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)
    mat_file = os.path.join(res_dir, f'pattern_{subject}.txt')
    tab_file = os.path.join(res_dir, f'pattern_{subject}.csv')
    np.savetxt(mat_file, dsm.samples)
    m_vols.to_csv(tab_file)
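    # mat_file ends up with one row per item (the run-wise z-scored patterns
    # averaged over presentations); tab_file holds the matching volume info.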
Example #3
def get_fake_data(nsubjects=20, noise_level=0.2, nbogus_classes=0):
    orig_ds = mean_group_sample(['targets'])(testing_datasets['uni3large'])
    # and creating an additional target which is a composition of the other two, so
    # it should be closer to them than to the left out L2
    classes_data = [
        orig_ds.samples, orig_ds[0].samples + orig_ds[1].samples,
        orig_ds[1].samples + 4 * orig_ds[2].samples
    ]
    classes_targets = list(orig_ds.T) + ['L0+1', 'L1+4*2']
    if nbogus_classes:
        classes_data.append(
            np.zeros((nbogus_classes, classes_data[0].shape[1]), dtype=float))
        classes_targets += ['B%02d' % i for i in xrange(nbogus_classes)]
    proto_ds = dataset_wizard(np.vstack(classes_data), targets=classes_targets)
    ntargets = len(proto_ds.UT)
    dss = []
    for i in xrange(nsubjects):
        R = get_random_rotation(proto_ds.nfeatures)
        ds = dataset_wizard(np.dot(proto_ds.samples, R), targets=proto_ds.T)
        #ds = dataset_wizard(proto_ds.samples, targets=proto_ds.T)
        ds.sa['subjects'] = [i]
        # And select a varying number of features
        ds = ds[:, :np.random.randint(10, ds.nfeatures)]
        # Add some noise
        ds.samples += np.random.normal(size=ds.shape) * noise_level
        dss.append(ds)
    return dss
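    # Each returned dataset is a randomly rotated copy of the prototype
    # patterns, restricted to a random subset of features, with additive
    # Gaussian noise.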
Example #4
def test_PDistTargetSimilaritySearchlight():
    # Test ability to use PDistTargetSimilarity in a searchlight
    from mvpa2.testing.datasets import datasets
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.mappers.shape import TransposeMapper
    from mvpa2.measures.searchlight import sphere_searchlight
    ds = datasets['3dsmall'][:, :3]
    ds.fa['voxel_indices'] = ds.fa.myspace
    # use chunks values (4 of them) for targets
    ds.sa['targets'] = ds.sa.chunks
    ds = mean_group_sample(['chunks'])(ds)
    tdsm = np.arange(6)
    # We can run on full dataset
    tdcm1 = PDistTargetSimilarity(tdsm)
    a1 = tdcm1(ds)
    assert_array_equal(a1.fa.metrics, ['rho', 'p'])

    tdcm1_rho = PDistTargetSimilarity(tdsm, corrcoef_only=True)
    sl_rho = sphere_searchlight(tdcm1_rho)(ds)
    assert_array_equal(sl_rho.shape, (1, ds.nfeatures))

    # now with both but we need to transpose datasets
    tdcm1_both = PDistTargetSimilarity(tdsm, postproc=TransposeMapper())
    sl_both = sphere_searchlight(tdcm1_both)(ds)
    assert_array_equal(sl_both.shape, (2, ds.nfeatures))
    assert_array_equal(sl_both.sa.metrics, ['rho', 'p'])
    # rho must be exactly the same
    assert_array_equal(sl_both.samples[0], sl_rho.samples[0])
    # just because we are here and we can
    # Actually here for some reason assert_array_lequal gave me a trouble
    assert_true(np.all(sl_both.samples[1] <= 1.0))
    assert_true(np.all(0 <= sl_both.samples[1]))
Example #6
 def _prepare_ds(self, ds):
     if self.params.sattr is not None:
         mgs = mean_group_sample(attrs=self.params.sattr)
         ds_ = mgs(ds)
     else:
         ds_ = ds.copy()
     return ds_
Example #8
    def _test_gideon_weird_case(self):
        """'The utter collapse' -- communicated by Peter J. Kohler

        Desire to collapse all samples per each category in training
        and testing sets, thus resulting only in a single
        sample/category per training and per testing.  As it is now,
        CrossValidation on MappedClassifier would not work

        observations: chance distribution obviously gets wide, but
        also gets skewed to anti-learning on nfolds like 4.
        
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        clf = kNN()
        print "HERE"
        ds = datasets['uni2large'].copy()
        ds = ds[ds.sa.chunks < 9]
        accs = []
        for i in xrange(10):          # # of random samples
            ds.samples = np.random.randn(*ds.shape)
            if False: # this would have been a native way IF we allowed change of number of samples
                clf2 = MappedClassifier(clf=kNN(), #clf,
                                        mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2, NFoldPartitioner(4), postproc=None,
                                     enable_ca=['stats'])
                print cv(ds)
            else:
                from mvpa2.clfs.transerror import ConfusionMatrix
                partitioner = NFoldPartitioner(6)
                meaner = mean_group_sample(['targets', 'partitions'])
                cm = ConfusionMatrix()
                te = TransferMeasure(clf, Splitter('partitions'),
                                     postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'),
                                     enable_ca = ['stats']
                                     )
                for part in partitioner.generate(ds):
                    ds_meaned = meaner(part)
                    error = np.asscalar(te(ds_meaned))
                    cm += te.ca.stats
                print i, cm.stats['ACC']
                accs.append(cm.stats['ACC'])
Example #9
def get_dissim_roi(subnr):
    ds = h5load(fns.betafn(subnr))
    ds = ds[:, mask_]
    ds = ds[ds.sa.condition != 'self']
    zscore(ds, chunks_attr='chunks')
    ds = mean_group_sample(['condition'])(ds)

    names = []
    dissims = []
    for roi, (center, ids) in rois.iteritems():
        names.append(roi)
        sample_roi = ds.samples[:, ids]
        dissim_roi = pdist(sample_roi, 'correlation')
        dissims.append(dissim_roi)
    dss = dataset_wizard(dissims, targets=names)
    return dss
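    # dss: one sample per ROI, each holding that ROI's vectorized
    # (condition x condition) correlation-distance matrix.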
Example #10
    def _call(self,dataset):
        data = dataset.samples
        if self.center_data:
            data = data - np.mean(data,0)

        #compute comparison sample
        mgs = mean_group_sample(['targets'])(dataset)
        comp_sample_data =  mgs[mgs.sa['targets'] == self.comparison_sample]
        #omit all samples from comparison_sample target condition
        dataset = dataset[dataset.sa.targets != self.comparison_sample]

        #calculate sample attribute of distance between sample and comparison_sample (corr coef and p value)
        dataset.sa['sample_comp_dist_r'] = [pearsonr(s.samples[0],comp_sample_data.samples[0])[0] for s in dataset]
        dataset.sa['sample_comp_dist_p'] = [pearsonr(s.samples[0],comp_sample_data.samples[0])[1] for s in dataset]
        rho, p = pearsonr(dataset.sa['sample_comp_dist_r'],dataset.sa[self.sample_covariable])
        if self.corrcoef_only:
            return Dataset(np.array([rho,]))
        else:
            return Dataset(np.array([rho,p]))
    def _call(self,dataset):
        data = dataset.samples
        if self.center_data:
            data = data - np.mean(data,0)

        #compute comparison sample
        comp_samps = mean_group_sample(['targets'])(dataset)
        #omit all samples from comparison_sample target conditions
        for om in self.targs_comps.values():
            dataset = dataset[dataset.sa.targets != om] 

        #calculate sample attribute of distance between sample and comparison_sample (corr coef and p value)
        dataset.sa['sample_comp_dist_r'] = [pearsonr(s.samples[0],comp_samps[comp_samps.sa.targets == self.targs_comps[s.sa.targets[0]]].samples[0])[0] for s in dataset]
        dataset.sa['sample_comp_dist_p'] = [pearsonr(s.samples[0],comp_samps[comp_samps.sa.targets == self.targs_comps[s.sa.targets[0]]].samples[0])[1] for s in dataset]
        #calculate final correlations
        rho, p = pearsonr(dataset.sa['sample_comp_dist_r'],dataset.sa[self.sample_covariable])
        if self.corrcoef_only:
            return Dataset(np.array([rho,]))
        else:
            return Dataset(np.array([rho, p]))
    def _call(self,dataset):

        #compute comparison sample
        ds = mean_group_sample(['targets'])(dataset)
        # Get neural dissim b/w pairs of targets
        pairsim = dict((pair[0]+'-'+pair[1],(1 - pearsonr(ds[ds.sa.targets == pair[0]].samples[0], ds[ds.sa.targets == pair[1]].samples[0])[0])) for pair in self.pairs)

        #Order DMs...
        pairs_dsm_o = OrderedDict(sorted(self.pairs_dsm.items())).values()
        pairsim_o = OrderedDict(sorted(pairsim.items())).values()

        #RSA
        if self.comparison_metric == 'spearman':
            res = np.arctanh(pearsonr(rankdata(pairs_dsm_o),rankdata(pairsim_o))[0])
        elif self.comparison_metric == 'pearson':
            res = np.arctanh(pearsonr(pairs_dsm_o,pairsim_o)[0])
        elif self.comparison_metric == 'euclidean':
            res = pdist(np.vstack([pairs_dsm_o, pairsim_o]))
            res = np.round((-1 * res) + 2) #why?
        return Dataset(np.array([res,]))
def label_examples(mri_data, beha_pkldat):

    # extract volume time-stamps from fMRI dataset (pymvpa2 Dataset)
    vol_times = mri_data.sa.time_coords
    
    # extract stimulus information from psychopy files (pandas DataFrame)
    onsets = beha_pkldat['TrialOnset'].values
    if 'trials_1.thisTrialN' in beha_pkldat: 
        trials = beha_pkldat['trials_1.thisTrialN'].values 
    else:
        trials = beha_pkldat['trials_2.thisTrialN'].values 
    
    memory_status = beha_pkldat['condition'].values
   
  
    """
    NOW CALLS THE FUNCTION label_trials - which labels relevant TRs with trial number and memory_status name
    """
    mri_data.sa['trials'] = label_trials(onsets,trials,vol_times)
    mri_data.sa['targets'] = label_trials(onsets,memory_status,vol_times)
    
    # remove volumes that are of no interest to us
    mri_data = mri_data[mri_data.sa.targets != '_no-use_']
    #if take_mean is True then use mean of volumes as examples
    #print [t for t in mri_data.samples[4:8]]
    #print mri_data.sa.targets
    #print mri_data.shape
    
    mri_data=mri_data.get_mapped(mean_group_sample(['targets', 'trials'], order = 'occurrence'))
    #print [t for t in mri_data.samples[2]]
    print mri_data.sa.targets
    print mri_data.sa.trials
    print vol_times# = mri_data.sa.time_coords
    print mri_data.shape
    """
    IMPORTANT CHECK
    """
    print mri_data.summary()
    return mri_data
Example #14
    def test_gideon_weird_case(self):
        """Test if MappedClassifier could handle a mapper altering number of samples

        'The utter collapse' -- communicated by Peter J. Kohler

        Desire to collapse all samples per each category in training
        and testing sets, thus resulting only in a single
        sample/category per training and per testing.

        It is a peculiar scenario which pinpoints the problem that, so
        far, mappers were assumed not to change the number of samples
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        from mvpa2.mappers.base import ChainMapper
        ds = datasets['uni2large'].copy()
        #ds = ds[ds.sa.chunks < 9]
        accs = []
        k = 1                           # for kNN
        nf = 1                          # for NFoldPartitioner
        for i in xrange(1):          # # of random runs
            ds.samples = np.random.randn(*ds.shape)
            #
            # There are 3 ways to accomplish needed goal
            #

            # 0. Hard way: overcome the problem by manually
            #    pre-splitting/meaning in a loop
            from mvpa2.clfs.transerror import ConfusionMatrix
            partitioner = NFoldPartitioner(nf)
            meaner = mean_group_sample(['targets', 'partitions'])
            cm = ConfusionMatrix()
            te = TransferMeasure(kNN(k), Splitter('partitions'),
                                 postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'),
                                 enable_ca = ['stats']
                                 )
            errors = []
            for part in partitioner.generate(ds):
                ds_meaned = meaner(part)
                errors.append(np.asscalar(te(ds_meaned)))
                cm += te.ca.stats
            #print i, cm.stats['ACC']
            accs.append(cm.stats['ACC'])


            if False: # not yet working -- see _tent/allow_ch_nsamples
                      # branch for attempt to make it work
                # 1. This is a "native way" IF we allow change of number
                #    of samples via _call to be done by MappedClassifier
                #    while operating solely on the mapped dataset
                clf2 = MappedClassifier(clf=kNN(k), #clf,
                                        mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2, NFoldPartitioner(nf), postproc=None,
                                     enable_ca=['stats'])
                # meaning all should be ok since we should have balanced
                # sets across all chunks here
                errors_native = cv(ds)

                self.assertEqual(np.max(np.abs(errors_native.samples[:,0] - errors)),
                                 0)

            # 2. Work without fixes to MappedClassifier allowing
            #    change of # of samples
            #
            # CrossValidation will operate on a chain mapper which
            # would perform necessary meaning first before dealing with
            # kNN cons: .stats would not be exposed since ChainMapper
            # doesn't expose them from ChainMapper (yet)
            if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
                raise SkipTest("Known to fail while trying to enable "
                               "training_stats for the ChainMapper")
            cv2 = CrossValidation(ChainMapper([mean_group_sample(['targets', 'partitions']),
                                               kNN(k)],
                                              space='targets'),
                                  NFoldPartitioner(nf),
                                  postproc=None)
            errors_native2 = cv2(ds)

            self.assertEqual(np.max(np.abs(errors_native2.samples[:,0] - errors)),
                             0)

            # All of the ways should provide the same results
            #print i, np.max(np.abs(errors_native.samples[:,0] - errors)), \
            #      np.max(np.abs(errors_native2.samples[:,0] - errors))

        if False: # just to investigate the distribution if we have enough iterations
            import pylab as pl
            uaccs = np.unique(accs)
            step = np.asscalar(np.unique(np.round(uaccs[1:] - uaccs[:-1], 4)))
            bins = np.linspace(0., 1., np.round(1./step+1))
            xx = pl.hist(accs, bins=bins, align='left')
            pl.xlim((0. - step/2, 1.+step/2))
Example #15
 def __init__(self, attributes, **kwargs):
     self.node = mean_group_sample(attributes)
     Transformer.__init__(self, name='sample_averager', **kwargs)
Example #16
def timesegments_classification(
        dss,
        hyper=None,
        part1=HalfPartitioner(),
        part2=NFoldPartitioner(attr='subjects'),
        window_size=6,
        overlapping_windows=True,
        distance='correlation',
        do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
       Datasets to benchmark on.  Usually a single dataset per subject.
    hyper : Hyperalignment-like, optional
       Beast which if called on a list of datasets should spit out trained
       mappers.  If not specified, `IdentityMapper`s will be used
    part1 : Partitioner, optional
       Partitioner to split data for hyperalignment "cross-validation"
    part2 : Partitioner, optional
       Partitioner for CV within the hyperalignment test split
    window_size : int, optional
       How many temporal points to consider for a classification sample
    overlapping_windows : bool, optional
       Strategy to how create and classify "samples" for classification.  If
       True -- `window_size` samples from each time point (but trailing ones)
       constitute a sample, and upon "predict" `window_size` of samples around
       each test point is not considered.  If False -- samples are just taken
       (with training and testing splits) at `window_size` step from one to
       another.
    do_zscore : bool, optional
       Perform zscoring (overall, not per-chunk) for each dataset upon
       partitioning with part1
    ...
    """
    # Generate outer-most partitioning ()
    parts = [copy.deepcopy(part1).generate(ds) for ds in dss]

    iter = 1
    errors = []

    while True:
        try:
            dss_partitioned = [p.next() for p in parts]
        except StopIteration:
            # we are done -- no more partitions
            break
        if __debug__:
            debug("BM", "Iteration %d", iter)

        dss_train, dss_test = zip(*[list(Splitter("partitions").generate(ds))
                                    for ds in dss_partitioned])

        # TODO:  allow for doing feature selection

        if do_zscore:
            for ds in dss_train + dss_test:
                zscore(ds, chunks_attr=None)

        if hyper is not None:
            # since otherwise it would remember previous loop dataset as the "commonspace"
            # Now let's do hyperalignment but on a copy in each loop iteration
            hyper_ = copy.deepcopy(hyper)
            mappers = hyper_(dss_train)
        else:
            mappers = [IdentityMapper() for ds in dss_train]

        dss_test_aligned = [mapper.forward(ds) for mapper, ds in zip(mappers, dss_test)]

        # assign .sa.subjects to those datasets
        for i, ds in enumerate(dss_test_aligned):
            # part2.attr is by default "subjects"
            ds.sa[part2.attr] = [i]

        dss_test_bc = []
        for ds in dss_test_aligned:
            if overlapping_windows:
                startpoints = range(len(ds) - window_size + 1)
            else:
                startpoints = _get_nonoverlapping_startpoints(len(ds), window_size)
            bm = BoxcarMapper(startpoints, window_size)
            bm.train(ds)
            ds_ = bm.forward(ds)
            ds_.sa['startpoints'] = startpoints
            # reassign subjects so they are not arrays
            def assign_unique(ds, sa):
                ds.sa[sa] = [np.asscalar(np.unique(x)) for x in ds.sa[sa].value]
            assign_unique(ds_, part2.attr)

            fm = FlattenMapper()
            fm.train(ds_)
            dss_test_bc.append(ds_.get_mapped(fm))

        ds_test = vstack(dss_test_bc)
        # Perform classification across subjects comparing against mean
        # spatio-temporal pattern of other subjects
        errors_across_subjects = []
        for ds_test_part in part2.generate(ds_test):
            ds_train_, ds_test_ = list(Splitter("partitions").generate(ds_test_part))
            # average across subjects to get a representative pattern per timepoint
            ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
            assert(ds_train_.shape == ds_test_.shape)

            if distance == 'correlation':
                # TODO: redo more efficiently since now we are creating full
                # corrcoef matrix.  Also we might better just take a name for
                # the pdist measure but then implement them efficiently
                # (i.e. without hstacking both pieces together first)
                dist = 1 - np.corrcoef(ds_train_, ds_test_)[len(ds_test_):, :len(ds_test_)]
            else:
                raise NotImplementedError

            if overlapping_windows:
                dist = wipe_out_offdiag(dist, window_size)

            winners = np.argmin(dist, axis=1)
            error = np.mean(winners != np.arange(len(winners)))
            errors_across_subjects.append(error)
        errors.append(errors_across_subjects)
        iter += 1

    errors = np.array(errors)
    if __debug__:
        debug("BM", "Finished with %s array of errors. Mean error %.2f"
              % (errors.shape, np.mean(errors)))
    return errors
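
A minimal usage sketch for the function above; the list `dss` of per-subject
datasets and the Hyperalignment instance are assumptions for illustration,
not part of the original snippet:

from mvpa2.algorithms.hyperalignment import Hyperalignment

# dss: assumed list of per-subject Datasets, one sample per time point,
# with the same number of time points in every subject
errors = timesegments_classification(dss, hyper=Hyperalignment(),
                                      window_size=6,
                                      overlapping_windows=True)
# errors: one row per outer (part1) split, one column per left-out subject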
Example #17
def group_sample_loser_measure(attrs=('targets',)):
    '''takes loser after meaning over attrs'''
    return ChainNode((mean_group_sample(attrs), sample_loser_measure()))
        else:
            print('partitioning ...')
            idxs_train, idxs_test = utils.get_train_test_splits(
                dataset, label_map, n_splits)

            if utils.check_train_test_splits(idxs_test):
                idxs_train, idxs_test = utils.get_train_test_splits(
                    dataset, label_map, n_splits)
            for word2vec_name, word2vec_features in zip(
                    word2vec_names, word2vec_vecs):
                r_squares, scores = [], []
                for fold, (idx_train, idx_test) in tqdm(
                        enumerate(zip(idxs_train, idxs_test))):
                    if average:
                        tr = dataset[idx_train].get_mapped(
                            mean_group_sample(['chunks', 'id'],
                                              order='occurrence'))
                    else:
                        tr = dataset[idx_train]
                    te = dataset[idx_test].get_mapped(
                        mean_group_sample(['chunks', 'id'],
                                          order='occurrence'))

                    #                scaler          = utils.build_model_dictionary(n_jobs=4)['RandomForest + Linear-SVM']
                    #                scaler.steps.pop(-1)

                    features_tr = np.array([
                        word2vec_features[word.lower()] for word in tr.sa.words
                    ])
                    BOLD_tr = tr.samples.astype('float32')
                    #                label_tr        = np.array([label_map[item] for item in tr.sa.targets])
                    features_te = np.array([
Example #19
    def test_gideon_weird_case(self):
        """Test if MappedClassifier could handle a mapper altering number of samples

        'The utter collapse' -- communicated by Peter J. Kohler

        Desire to collapse all samples per each category in training
        and testing sets, thus resulting only in a single
        sample/category per training and per testing.

        It is a peculiar scenario which pinpoints the problem that, so
        far, mappers were assumed not to change the number of samples
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        from mvpa2.mappers.base import ChainMapper
        ds = datasets['uni2large'].copy()
        #ds = ds[ds.sa.chunks < 9]
        accs = []
        k = 1  # for kNN
        nf = 1  # for NFoldPartitioner
        for i in xrange(1):  # # of random runs
            ds.samples = np.random.randn(*ds.shape)
            #
            # There are 3 ways to accomplish needed goal
            #

            # 0. Hard way: overcome the problem by manually
            #    pre-splitting/meaning in a loop
            from mvpa2.clfs.transerror import ConfusionMatrix
            partitioner = NFoldPartitioner(nf)
            meaner = mean_group_sample(['targets', 'partitions'])
            cm = ConfusionMatrix()
            te = TransferMeasure(kNN(k),
                                 Splitter('partitions'),
                                 postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'),
                                 enable_ca=['stats'])
            errors = []
            for part in partitioner.generate(ds):
                ds_meaned = meaner(part)
                errors.append(np.asscalar(te(ds_meaned)))
                cm += te.ca.stats
            #print i, cm.stats['ACC']
            accs.append(cm.stats['ACC'])

            if False:  # not yet working -- see _tent/allow_ch_nsamples
                # branch for attempt to make it work
                # 1. This is a "native way" IF we allow change of number
                #    of samples via _call to be done by MappedClassifier
                #    while operating solely on the mapped dataset
                clf2 = MappedClassifier(
                    clf=kNN(k),  #clf,
                    mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2,
                                     NFoldPartitioner(nf),
                                     postproc=None,
                                     enable_ca=['stats'])
                # meaning all should be ok since we should have balanced
                # sets across all chunks here
                errors_native = cv(ds)

                self.assertEqual(
                    np.max(np.abs(errors_native.samples[:, 0] - errors)), 0)

            # 2. Work without fixes to MappedClassifier allowing
            #    change of # of samples
            #
            # CrossValidation will operate on a chain mapper which
            # would perform necessary meaning first before dealing with
            # kNN cons: .stats would not be exposed since ChainMapper
            # doesn't expose them from ChainMapper (yet)
            if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
                raise SkipTest("Known to fail while trying to enable "
                               "training_stats for the ChainMapper")
            cv2 = CrossValidation(ChainMapper(
                [mean_group_sample(['targets', 'partitions']),
                 kNN(k)],
                space='targets'),
                                  NFoldPartitioner(nf),
                                  postproc=None)
            errors_native2 = cv2(ds)

            self.assertEqual(
                np.max(np.abs(errors_native2.samples[:, 0] - errors)), 0)

            # All of the ways should provide the same results
            #print i, np.max(np.abs(errors_native.samples[:,0] - errors)), \
            #      np.max(np.abs(errors_native2.samples[:,0] - errors))

        if False:  # just to investigate the distribution if we have enough iterations
            import pylab as pl
            uaccs = np.unique(accs)
            step = np.asscalar(np.unique(np.round(uaccs[1:] - uaccs[:-1], 4)))
            bins = np.linspace(0., 1., np.round(1. / step + 1))
            xx = pl.hist(accs, bins=bins, align='left')
            pl.xlim((0. - step / 2, 1. + step / 2))
Example #20
    pl.imshow(mtx, interpolation='nearest')
    pl.xticks(range(len(mtx)), labels, rotation=-45)
    pl.yticks(range(len(mtx)), labels)
    pl.title(title)
    pl.clim((0,1))
    pl.colorbar()

"""
As a start, we want to inspect the dissimilarity structure of the stimulation
conditions in the entire ROI. For this purpose, we average all samples of
each condition into a single exemplar, using an FxMapper() instance.
"""

# compute a dataset with the mean samples for all conditions
from mvpa2.mappers.fx import mean_group_sample
mtgs = mean_group_sample(['targets'])
mtds = mtgs(ds)

"""
After these preparations we can use the PDist() measure to compute the desired
distance matrix -- by default using correlation distance as a metric. The
``square`` argument will cause a full square matrix to be
produced, instead of a leaner upper-triangular matrix in vector form.
"""

# basic ROI RSA -- dissimilarity matrix for the entire ROI
from mvpa2.measures import rsa
dsm = rsa.PDist(square=True)
res = dsm(mtds)
plot_mtx(res, mtds.sa.targets, 'ROI pattern correlation distances')
def timesegments_classification(dss,
                                hyper=None,
                                part1=HalfPartitioner(),
                                part2=NFoldPartitioner(attr='subjects'),
                                window_size=6,
                                overlapping_windows=True,
                                distance='correlation',
                                do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
       Datasets to benchmark on.  Usually a single dataset per subject.
    hyper : Hyperalignment-like, optional
       Beast which if called on a list of datasets should spit out trained
       mappers.  If not specified, `IdentityMapper`s will be used
    part1 : Partitioner, optional
       Partitioner to split data for hyperalignment "cross-validation"
    part2 : Partitioner, optional
       Partitioner for CV within the hyperalignment test split
    window_size : int, optional
       How many temporal points to consider for a classification sample
    overlapping_windows : bool, optional
       Strategy to how create and classify "samples" for classification.  If
       True -- `window_size` samples from each time point (but trailing ones)
       constitute a sample, and upon "predict" `window_size` of samples around
       each test point is not considered.  If False -- samples are just taken
       (with training and testing splits) at `window_size` step from one to
       another.
    do_zscore : bool, optional
       Perform zscoring (overall, not per-chunk) for each dataset upon
       partitioning with part1
    ...
    """
    # Generate outer-most partitioning ()
    parts = [copy.deepcopy(part1).generate(ds) for ds in dss]

    iter = 1
    errors = []

    while True:
        try:
            dss_partitioned = [p.next() for p in parts]
        except StopIteration:
            # we are done -- no more partitions
            break
        if __debug__:
            debug("BM", "Iteration %d", iter)

        dss_train, dss_test = zip(*[
            list(Splitter("partitions").generate(ds)) for ds in dss_partitioned
        ])

        # TODO:  allow for doing feature selection

        if do_zscore:
            for ds in dss_train + dss_test:
                zscore(ds, chunks_attr=None)

        if hyper is not None:
            # since otherwise it would remember previous loop dataset as the "commonspace"
            # Now let's do hyperalignment but on a copy in each loop iteration
            hyper_ = copy.deepcopy(hyper)
            mappers = hyper_(dss_train)
        else:
            mappers = [IdentityMapper() for ds in dss_train]

        dss_test_aligned = [
            mapper.forward(ds) for mapper, ds in zip(mappers, dss_test)
        ]

        # assign .sa.subjects to those datasets
        for i, ds in enumerate(dss_test_aligned):
            # part2.attr is by default "subjects"
            ds.sa[part2.attr] = [i]

        dss_test_bc = []
        for ds in dss_test_aligned:
            if overlapping_windows:
                startpoints = range(len(ds) - window_size + 1)
            else:
                startpoints = _get_nonoverlapping_startpoints(
                    len(ds), window_size)
            bm = BoxcarMapper(startpoints, window_size)
            bm.train(ds)
            ds_ = bm.forward(ds)
            ds_.sa['startpoints'] = startpoints

            # reassign subjects so they are not arrays
            def assign_unique(ds, sa):
                ds.sa[sa] = [
                    np.asscalar(np.unique(x)) for x in ds.sa[sa].value
                ]

            assign_unique(ds_, part2.attr)

            fm = FlattenMapper()
            fm.train(ds_)
            dss_test_bc.append(ds_.get_mapped(fm))

        ds_test = vstack(dss_test_bc)
        # Perform classification across subjects comparing against mean
        # spatio-temporal pattern of other subjects
        errors_across_subjects = []
        for ds_test_part in part2.generate(ds_test):
            ds_train_, ds_test_ = list(
                Splitter("partitions").generate(ds_test_part))
            # average across subjects to get a representative pattern per timepoint
            ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
            assert (ds_train_.shape == ds_test_.shape)

            if distance == 'correlation':
                # TODO: redo more efficiently since now we are creating full
                # corrcoef matrix.  Also we might better just take a name for
                # the pdist measure but then implement them efficiently
                # (i.e. without hstacking both pieces together first)
                dist = 1 - np.corrcoef(
                    ds_train_, ds_test_)[len(ds_test_):, :len(ds_test_)]
            else:
                raise NotImplementedError

            if overlapping_windows:
                dist = wipe_out_offdiag(dist, window_size)

            winners = np.argmin(dist, axis=1)
            error = np.mean(winners != np.arange(len(winners)))
            errors_across_subjects.append(error)
        errors.append(errors_across_subjects)
        iter += 1

    errors = np.array(errors)
    if __debug__:
        debug(
            "BM", "Finished with %s array of errors. Mean error %.2f" %
            (errors.shape, np.mean(errors)))
    return errors
Example #22
        return dss[0].a.mapper.reverse(ds)

    import pylab as pl
    pl.clf()
    DS = dsvstack(dss)
    # Sample plots
    for s in [0, 1]:
        ds2 = get2d(dss[0])
        for r in [0, 1]:
            pl.subplot(3, 3, 1 + r + s * 3)
            pl.imshow(ds2[ds2.sa.chunks == r].samples[0],
                      interpolation='nearest')
            pl.ylabel('subj%d' % s)
            pl.xlabel('run1')
        pl.subplot(3, 3, 3 + s * 3)
        pl.imshow(get2d(mean_group_sample(['dissimilarity'
                                           ])(dss[0]).samples)[0],
                  interpolation='nearest')
        pl.xlabel('mean')

    ds = dsvstack(dss)
    ds.a['mapper'] = dss[0].a.mapper
    ds_mean = mean_group_sample(['dissimilarity', 'chunks'])(ds)
    for r in [0, 1]:
        ds_mean_run0 = ds.a.mapper.reverse(ds_mean[ds_mean.chunks == r])
        pl.subplot(3, 3, 1 + r + 2 * 3)
        pl.imshow(ds_mean_run0.samples[0], interpolation='nearest')
        pl.ylabel('mean(subj)')
        pl.xlabel('run%d' % r)
    ds_global_mean = mean_group_sample(['dissimilarity'])(ds)
    pl.subplot(3, 3, 3 + 2 * 3)
    pl.imshow(get2d(ds_global_mean).samples[0], interpolation='nearest')
Example #23
def test_hrf_modeling():
    skip_if_no_external('nibabel')
    skip_if_no_external('nipy')  # ATM relies on NiPy's GLM implementation
    ds = load_example_fmri_dataset('25mm')  #literal=True)
    # TODO: simulate short dataset with known properties and use it
    # for testing
    events = find_events(targets=ds.sa.targets, chunks=ds.sa.chunks)
    tr = ds.a.imghdr['pixdim'][4]
    for ev in events:
        for a in ('onset', 'duration'):
            ev[a] = ev[a] * tr
    evds = eventrelated_dataset(ds,
                                events,
                                time_attr='time_coords',
                                condition_attr='targets',
                                design_kwargs=dict(drift_model='blank'),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    # same voxels
    assert_equal(ds.nfeatures, evds.nfeatures)
    assert_array_equal(ds.fa.voxel_indices, evds.fa.voxel_indices)
    # one sample for each condition, plus constant
    assert_equal(sorted(ds.sa['targets'].unique), sorted(evds.sa.targets))
    assert_equal(evds.a.add_regs.sa.regressor_names[0], 'constant')
    # with centered data
    zscore(ds)
    evds_demean = eventrelated_dataset(ds,
                                       events,
                                       time_attr='time_coords',
                                       condition_attr='targets',
                                       design_kwargs=dict(drift_model='blank'),
                                       glmfit_kwargs=dict(model='ols'),
                                       model='hrf')
    # after demeaning the constant should consume a lot less
    assert (evds.a.add_regs[0].samples.mean() >
            evds_demean.a.add_regs[0].samples.mean())
    # from eyeballing the sensitivity example -- would be better to test this on
    # the tutorial data
    assert(evds_demean[evds.sa.targets == 'shoe'].samples.max() \
           > evds_demean[evds.sa.targets == 'bottle'].samples.max())
    # HRF models
    assert ('regressors' in evds.sa)
    assert ('regressors' in evds.a.add_regs.sa)
    assert_equal(evds.sa.regressors.shape[1], len(ds))

    # custom regressors
    evds_regrs = eventrelated_dataset(ds,
                                      events,
                                      time_attr='time_coords',
                                      condition_attr='targets',
                                      regr_attrs=['time_indices'],
                                      design_kwargs=dict(drift_model='blank'),
                                      glmfit_kwargs=dict(model='ols'),
                                      model='hrf')
    # verify that nothing screwed up time_coords
    assert_equal(ds.sa.time_coords[0], 0)
    assert_equal(len(evds_regrs), len(evds))
    # one more output sample in .a.add_regs
    assert_equal(len(evds_regrs.a.add_regs) - 1, len(evds.a.add_regs))
    # comes last before constant
    assert_equal('time_indices', evds_regrs.a.add_regs.sa.regressor_names[-2])
    # order of main regressors is unchanged
    assert_array_equal(evds.sa.targets, evds_regrs.sa.targets)

    # custom regressors from external sources
    evds_regrs = eventrelated_dataset(
        ds,
        events,
        time_attr='time_coords',
        condition_attr='targets',
        regr_attrs=['time_coords'],
        design_kwargs=dict(drift_model='blank',
                           add_regs=np.linspace(1, -1, len(ds))[None].T,
                           add_reg_names=['negative_trend']),
        glmfit_kwargs=dict(model='ols'),
        model='hrf')
    assert_equal(len(evds_regrs), len(evds))
    # But we got one more in additional regressors
    assert_equal(len(evds_regrs.a.add_regs) - 2, len(evds.a.add_regs))
    # comes last before constant
    assert_array_equal(['negative_trend', 'time_coords', 'constant'],
                       evds_regrs.a.add_regs.sa.regressor_names)
    # order is otherwise unchanged
    assert_array_equal(evds.sa.targets, evds_regrs.sa.targets)

    # HRF models with estimating per each chunk
    assert_equal(ds.sa.time_coords[0], 0)
    evds_regrs = eventrelated_dataset(ds,
                                      events,
                                      time_attr='time_coords',
                                      condition_attr=['targets', 'chunks'],
                                      regr_attrs=['time_indices'],
                                      design_kwargs=dict(drift_model='blank'),
                                      glmfit_kwargs=dict(model='ols'),
                                      model='hrf')
    assert_true('add_regs' in evds_regrs.a)
    assert_true('time_indices' in evds_regrs.a.add_regs.sa.regressor_names)

    assert_equal(len(ds.UC) * len(ds.UT), len(evds_regrs))
    assert_equal(len(evds_regrs.UC) * len(evds_regrs.UT), len(evds_regrs))

    from mvpa2.mappers.fx import mean_group_sample
    evds_regrs_meaned = mean_group_sample(['targets'])(evds_regrs)
    assert_array_equal(evds_regrs_meaned.T,
                       evds.T)  # targets should be the same
Example #25
def plot_mtx(mtx, labels, title):
    pl.figure()
    pl.imshow(mtx, interpolation='nearest')
    pl.xticks(range(len(mtx)), labels, rotation=-45)
    pl.yticks(range(len(mtx)), labels)
    pl.title(title)
    pl.clim((0, 2))
    pl.colorbar()
    
"""
As a start, we want to inspect the dissimilarity structure of the stimulation conditions in the entire ROI. For this purpose, we average all samples of each condition into a single exemplar, using an FxMapper() instance.
"""

# compute a dataset with the mean samples for all conditions
from mvpa2.mappers.fx import mean_group_sample
mtgs = mean_group_sample(['targets'])
mtds = mtgs(ds)
"""
After these preparations we can use the PDist() measure to compute the desired distance matrix -- by default using correlation distance as a metric. The square argument will cause a full square matrix to be produced, instead of a leaner upper-triangular matrix in vector form.
"""

# basic ROI RSA -- dissimilarity matrix for the entire ROI
from mvpa2.measures import rsa
dsm = rsa.PDist(square=True)
res = dsm(mtds)
plot_mtx(res, mtds.sa.targets, 'ROI pattern correlation distances')
"""
Inspecting the figure we can see that there is not much structure in the matrix, except for the face and the house condition being slightly more dissimilar than others.
"""

"""
Now, let’s take a look at the variation of similarity structure through the brain. We can plug the PDist() measure into a searchlight to quickly scan the brain and harvest this information.
"""

# same as above, but done in a searchlight fashion
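"""
A sketch of what that searchlight analysis could look like, assuming `ds` is
still the full dataset carrying fa.voxel_indices; the radius value here is
only illustrative.
"""

from mvpa2.measures.searchlight import sphere_searchlight
dsm = rsa.PDist(square=False)           # vector-form dissimilarity per sphere
sl = sphere_searchlight(dsm, radius=2)  # spheres of 2-voxel radius
slres = sl(ds)                          # one column of results per center voxel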
Example #26
# In[ ]:

zscore(fds, param_est=('targets', ['rest']))

# ### Remove the volumes assigned to baseline

# In[ ]:

fds = fds[fds.sa.targets != 'rest']

# ### Average the volumes

# In[ ]:

from mvpa2.mappers.fx import mean_group_sample
mtgs = mean_group_sample(['targets'])
mtds = mtgs(fds)

# ### Measure the distances between categories

# In[ ]:

dsm = rsa.PDist(square=True)
res = dsm(mtds)

# ### Show the results in a figure

# In[ ]:

plot_mtx(res, mtds.sa.targets, 'ROI pattern correlation distances')
Example #27
def timesegments_classification(dss,
                                window_size=6,
                                overlapping_windows=True,
                                distance='correlation',
                                do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
       Datasets to benchmark on.  Usually a single dataset per subject.
    window_size : int, optional
       How many temporal points to consider for a classification sample
    overlapping_windows : bool, optional
       Strategy to how create and classify "samples" for classification.  If
       True -- `window_size` samples from each time point (but trailing ones)
       constitute a sample, and upon "predict" `window_size` of samples around
       each test point is not considered.  If False -- samples are just taken
       (with training and testing splits) at `window_size` step from one to
       another.
    do_zscore : bool, optional
       Perform zscoring (overall, not per-chunk) for each dataset upon
       partitioning with part1
    ...
    """
    part2 = NFoldPartitioner(attr='subjects')
    # Check if input list contains Datasets, ndarrays
    dss = [Dataset(ds) if not type(ds) == Dataset else ds for ds in dss]
    # TODO:  allow for doing feature selection
    if do_zscore:
        for ds in dss:
            zscore(ds, chunks_attr=None)

    # assign .sa.subjects to those datasets
    for i, ds in enumerate(dss):
        # part2.attr is by default "subjects"
        ds.sa[part2.attr] = [i]

    dss_test_bc = []
    for ds in dss:
        if overlapping_windows:
            startpoints = range(len(ds) - window_size + 1)
        else:
            startpoints = _get_nonoverlapping_startpoints(len(ds), window_size)
        bm = BoxcarMapper(startpoints, window_size)
        bm.train(ds)
        ds_ = bm.forward(ds)
        ds_.sa['startpoints'] = startpoints

        # reassign subjects so they are not arrays
        def assign_unique(ds, sa):
            ds.sa[sa] = [np.asscalar(np.unique(x)) for x in ds.sa[sa].value]

        assign_unique(ds_, part2.attr)

        fm = FlattenMapper()
        fm.train(ds_)
        dss_test_bc.append(ds_.get_mapped(fm))

    ds_test = vstack(dss_test_bc)
    # Perform classification across subjects comparing against mean
    # spatio-temporal pattern of other subjects
    errors_across_subjects = []
    for ds_test_part in part2.generate(ds_test):
        ds_train_, ds_test_ = list(
            Splitter("partitions").generate(ds_test_part))
        # average across subjects to get a representative pattern per timepoint
        ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
        assert (ds_train_.shape == ds_test_.shape)

        if distance == 'correlation':
            # TODO: redo more efficiently since now we are creating full
            # corrcoef matrix.  Also we might better just take a name for
            # the pdist measure but then implement them efficiently
            # (i.e. without hstacking both pieces together first)
            dist = 1 - np.corrcoef(ds_train_,
                                   ds_test_)[len(ds_test_):, :len(ds_test_)]
        else:
            raise NotImplementedError

        if overlapping_windows:
            dist = wipe_out_offdiag(dist, window_size)

        winners = np.argmin(dist, axis=1)
        error = np.mean(winners != np.arange(len(winners)))
        errors_across_subjects.append(error)

    errors_across_subjects = np.asarray(errors_across_subjects)
    if __debug__:
        debug(
            "BM", "Finished with %s array of errors. Mean error %.2f" %
            (errors_across_subjects.shape, np.mean(errors_across_subjects)))
    return errors_across_subjects
Example #28
    fn = prefix + '*' + suffix
    files = sorted(glob.glob(fn))
    for x in range(len(files)):
        if x < 5:
            chunks = [x + 1] * 20
        else:
            chunks = [x - 5 + 1] * 20
        d = mv.gifti_dataset(files[x], chunks=chunks, targets=conditions)
        d.sa['conditions'] = conditions
        if ds is None:
            ds = d
        else:
            ds = mv.vstack((ds, d))
    ds.fa['node_indices'] = range(ds.shape[1])
    ds.samples = zscore(ds.samples, axis=1)
    mtgs = mean_group_sample(['conditions'])
    mtds = mtgs(ds)
    slres = sl(mtds)
    slres.samples = np.nan_to_num(slres.samples)
    all_slres.append(slres.samples)

# all_slres has all (190, 40962) RDMs for each subject
# now we need ISCs
# (12, 190, 40962)
# list of 40962 items (12, 190)
all_slres = np.array(all_slres)
all_slres = np.swapaxes(all_slres, 0, 2)

results = []
for sl_data in all_slres:
    # now i have a 190 by 12 matrix
Example #29
def main(subject,
         study_dir,
         mask,
         feature_mask,
         models,
         category,
         res_name,
         suffix='_stim_fix2',
         radius=3,
         n_perm=1000,
         n_proc=None):
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.datasets.mri import map2nifti
    from wikisim import mvpa

    # load subject data
    sp = su.SubjPath(subject, study_dir)
    vols = task.prex_vols(sp.path('behav', 'log'))

    # load fmri data
    ds = mvpa.load_prex_beta(sp,
                             suffix,
                             mask,
                             feature_mask=feature_mask,
                             verbose=1)

    # zscore
    ds.sa['run'] = vols.run.values
    zscore(ds, chunks_attr='run')

    # average over item presentations
    ds.sa['itemno'] = vols.itemno.to_numpy()
    m = mean_group_sample(['itemno'])
    dsm = ds.get_mapped(m)

    # get items of interest
    if category == 'face':
        cond = [1, 2]
    elif category == 'scene':
        cond = [3, 4]
    else:
        raise ValueError(f'Invalid category code: {category}')
    include = vols.groupby('itemno').first()['cond'].isin(cond)

    # get models of interest
    model_dir = os.path.join(study_dir, 'batch', 'models3')
    model_names = models.split('-')
    model_rdms_dict = model.load_category_rdms(model_dir, category,
                                               model_names)
    model_rdms = [model_rdms_dict[name] for name in model_names]

    # set up searchlight
    m = mvpa.ItemPartialRSA(model_rdms, n_perm)
    sl = sphere_searchlight(m, radius=radius, nproc=n_proc)
    sl_map = sl(dsm[include])

    nifti_include = map2nifti(ds, sl_map[-1])
    for i, name in enumerate(model_names):
        # save zstat map
        res_dir = sp.path('rsa', f'{res_name}_{name}')
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        filepath = os.path.join(res_dir, 'zstat.nii.gz')
        nifti = map2nifti(ds, sl_map[i])
        nifti.to_filename(filepath)

        # save mask of included voxels
        include_file = os.path.join(res_dir, 'included.nii.gz')
        nifti_include.to_filename(include_file)
Example #30
def get_dsm_roi_secondorder_xval2(ds,
                                  rois,
                                  zscore_ds=True,
                                  part=OddEvenPartitioner(),
                                  cond_chunk='condition'):
    """ Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the second level, thus the resulting dsms are 
    not symmetrical.

    Arguments
    --------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        is the dset already zscored?
    part: partitioner
    cond_chunk: str
        across which sample attribute to perform mean group sample

    Returns
    -------
    dataset containing second level dsm
    """
    #ds = h5load(fns.betafn(subnr))
    #ds = ds[:, mask_]
    #ds = ds[ds.sa.condition != 'self']
    if zscore_ds:
        zscore(ds, chunks_attr='chunks')

    # set up oddeven partition
    #part = OddEvenPartitioner()

    rdms = []
    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    for ds_ in part.generate(ds):
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)
        assert (ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms for each fold
        names = []
        centers = []
        dissims_1 = []
        dissims_2 = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim1_roi = pdist(sample1_roi, 'correlation')
            dissim2_roi = pdist(sample2_roi, 'correlation')

            dissims_1.append(dissim1_roi)
            dissims_2.append(dissim2_roi)

        dss1 = np.array(dissims_1)
        dss2 = np.array(dissims_2)

        # now compute second-order rdm correlating across folds
        dissim_2ndorder = 1. - corrcoefxy(dss1.T, dss2.T)
        dissim_2ndorder = dataset_wizard(dissim_2ndorder, targets=names)
        dissim_2ndorder.sa['centers'] = centers
        # also add fa information about roi
        dissim_2ndorder.fa['roi'] = names
        dissims_folds.append(dissim_2ndorder)

    # average
    dissims = dissims_folds[0]
    for d in dissims_folds[1:]:
        dissims.samples += d.samples
    dissims.samples /= len(dissims_folds)
    return dissims
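
A hypothetical call of the function above; the ROI dictionary entries
(center id, feature ids) are illustrative values only, and `ds` is assumed
to be a dataset with sa.condition and sa.chunks as described in the docstring:

rois = {'V1': (0, [0, 1, 2, 3]), 'FFA': (100, [100, 101, 102, 103])}
dsm_2nd = get_dsm_roi_secondorder_xval2(ds, rois, zscore_ds=True)
# dsm_2nd.samples: ROI x ROI matrix of cross-validated second-order
# correlation distances, averaged over the partitioner's folds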
Example #31
def test_hrf_modeling():
    skip_if_no_external('nibabel')
    skip_if_no_external('nipy') # ATM relies on NiPy's GLM implementation
    ds = load_example_fmri_dataset('25mm') #literal=True)
    # TODO: simulate short dataset with known properties and use it
    # for testing
    events = find_events(targets=ds.sa.targets, chunks=ds.sa.chunks)
    tr = ds.a.imghdr['pixdim'][4]
    for ev in events:
        for a in ('onset', 'duration'):
            ev[a] = ev[a] * tr
    evds = eventrelated_dataset(ds, events, time_attr='time_coords',
                                condition_attr='targets',
                                design_kwargs=dict(drift_model='blank'),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    # same voxels
    assert_equal(ds.nfeatures, evds.nfeatures)
    assert_array_equal(ds.fa.voxel_indices, evds.fa.voxel_indices)
    # one sample for each condition, plus constant
    assert_equal(sorted(ds.sa['targets'].unique), sorted(evds.sa.targets))
    assert_equal(evds.a.add_regs.sa.regressor_names[0], 'constant')
    # with centered data
    zscore(ds)
    evds_demean = eventrelated_dataset(ds, events, time_attr='time_coords',
                                condition_attr='targets',
                                design_kwargs=dict(drift_model='blank'),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    # after demeaning the constant should consume a lot less
    assert(evds.a.add_regs[0].samples.mean()
           > evds_demean.a.add_regs[0].samples.mean())
    # from eyeballing the sensitivity example -- would be better to test this on
    # the tutorial data
    assert(evds_demean[evds.sa.targets == 'shoe'].samples.max() \
           > evds_demean[evds.sa.targets == 'bottle'].samples.max())
    # HRF models
    assert('regressors' in evds.sa)
    assert('regressors' in evds.a.add_regs.sa)
    assert_equal(evds.sa.regressors.shape[1], len(ds))

    # custom regressors
    evds_regrs = eventrelated_dataset(ds, events, time_attr='time_coords',
                                condition_attr='targets',
                                regr_attrs=['time_indices'],
                                design_kwargs=dict(drift_model='blank'),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    # verify that nothing screwed up time_coords
    assert_equal(ds.sa.time_coords[0], 0)
    assert_equal(len(evds_regrs), len(evds))
    # one more output sample in .a.add_regs
    assert_equal(len(evds_regrs.a.add_regs) - 1, len(evds.a.add_regs))
    # comes last before constant
    assert_equal('time_indices', evds_regrs.a.add_regs.sa.regressor_names[-2])
    # order of main regressors is unchanged
    assert_array_equal(evds.sa.targets, evds_regrs.sa.targets)

    # custom regressors from external sources
    evds_regrs = eventrelated_dataset(ds, events, time_attr='time_coords',
                                condition_attr='targets',
                                regr_attrs=['time_coords'],
                                design_kwargs=dict(drift_model='blank',
                                                   add_regs=np.linspace(1, -1, len(ds))[None].T,
                                                   add_reg_names=['negative_trend']),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    assert_equal(len(evds_regrs), len(evds))
    # But we got one more in additional regressors
    assert_equal(len(evds_regrs.a.add_regs) - 2, len(evds.a.add_regs))
    # comes last before constant
    assert_array_equal(['negative_trend', 'time_coords', 'constant'],
                       evds_regrs.a.add_regs.sa.regressor_names)
    # order is otherwise unchanged
    assert_array_equal(evds.sa.targets, evds_regrs.sa.targets)

    # HRF models with estimating per each chunk
    assert_equal(ds.sa.time_coords[0], 0)
    evds_regrs = eventrelated_dataset(ds, events, time_attr='time_coords',
                                condition_attr=['targets', 'chunks'],
                                regr_attrs=['time_indices'],
                                design_kwargs=dict(drift_model='blank'),
                                glmfit_kwargs=dict(model='ols'),
                                model='hrf')
    assert_true('add_regs' in evds_regrs.a)
    assert_true('time_indices' in evds_regrs.a.add_regs.sa.regressor_names)

    assert_equal(len(ds.UC) * len(ds.UT), len(evds_regrs))
    assert_equal(len(evds_regrs.UC) * len(evds_regrs.UT), len(evds_regrs))

    from mvpa2.mappers.fx import mean_group_sample
    evds_regrs_meaned = mean_group_sample(['targets'])(evds_regrs)
    assert_array_equal(evds_regrs_meaned.T, evds.T) # targets should be the same
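A condensed, hedged sketch of the HRF-modeling pattern exercised above, reusing the same calls as the test; the catch-all mvpa2.suite import is an assumption made here for brevity.

from mvpa2.suite import *   # assumed to expose load_example_fmri_dataset, find_events, eventrelated_dataset

ds = load_example_fmri_dataset('25mm')
# detect blocks of identical (targets, chunks) and convert onsets/durations to seconds
events = find_events(targets=ds.sa.targets, chunks=ds.sa.chunks)
tr = ds.a.imghdr['pixdim'][4]
for ev in events:
    ev['onset'] *= tr
    ev['duration'] *= tr
# one HRF-convolved regressor per condition -> one beta sample per condition
evds = eventrelated_dataset(ds, events, time_attr='time_coords',
                            condition_attr='targets',
                            design_kwargs=dict(drift_model='blank'),
                            glmfit_kwargs=dict(model='ols'),
                            model='hrf')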
Exemple #32
0
def get_dsm_roi_xval1(ds,
                      rois,
                      zscore_ds=True,
                      part=OddEvenPartitioner(),
                      cond_chunk='condition'):
    """ Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the first level, thus the resulting dsms are 
    symmetrical.

    Arguments
    --------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        whether to z-score the dataset (chunk-wise) before computing dissimilarities
    part: partitioner
        partitioner used to generate the cross-validation folds
    cond_chunk: str
        sample attribute over which samples are averaged (mean_group_sample)

    Returns
    -------
    dataset containing second level dsm
    """
    #ds = h5load(fns.betafn(subnr))
    #ds = ds[:, mask_]
    #ds = ds[ds.sa.condition != 'self']
    if zscore_ds:
        zscore(ds, chunks_attr='chunks')

    # set up oddeven partition
    #part = OddEvenPartitioner()

    rdms = []
    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    for ds_ in part.generate(ds):
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)
        assert (ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms cross-validated across folds
        names = []
        centers = []
        dissims = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim_roi = 1. - corrcoefxy(sample1_roi.T, sample2_roi.T)
            nsamples = ds_1.nsamples
            assert (dissim_roi.shape == (nsamples, nsamples))

            dissims.append(
                dissim_roi.flatten())  # now the RDM is not symmetrical anymore
        dissims_folds.append(np.array(dissims))

    # average across folds
    dissims_folds = np.array(dissims_folds).mean(axis=0)
    assert (dissims_folds.shape == (len(names), nsamples**2))

    # now compute second level (distances)
    distance_roi = dist.pdist(dissims_folds, metric='correlation')

    dissims_folds = dataset_wizard(dist.squareform(distance_roi),
                                   targets=names)
    dissims_folds.fa['roi'] = names
    dissims_folds.sa['centers'] = centers

    return dissims_folds
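A hedged usage sketch for get_dsm_roi_xval1: beta_ds is a hypothetical per-trial beta dataset with 'chunks' and 'condition' sample attributes, and rois is a dict of (center, feature_ids) tuples as described in the docstring.

# hypothetical inputs -- see the note above
dsm_2nd = get_dsm_roi_xval1(beta_ds, rois, zscore_ds=True,
                            cond_chunk='condition')
print(dsm_2nd.sa.targets)     # ROI names (also available as dsm_2nd.fa.roi)
print(dsm_2nd.samples.shape)  # (n_rois, n_rois) second-order dissimilarities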
Exemple #33
0
class SearchlightTests(unittest.TestCase):
    def setUp(self):
        self.dataset = datasets['3dlarge']
        # give the feature coord a more common name, matching the default of
        # the searchlight
        self.dataset.fa['voxel_indices'] = self.dataset.fa.myspace
        self._tested_pprocess = False

    # https://github.com/PyMVPA/PyMVPA/issues/67
    # https://github.com/PyMVPA/PyMVPA/issues/69
    def test_gnbsearchlight_doc(self):
        # Test whether we excluded nproc from the docstrings
        ok_(not 'nproc' in GNBSearchlight.__init__.__doc__)
        ok_(not 'nproc' in GNBSearchlight.__doc__)
        ok_(not 'nproc' in sphere_gnbsearchlight.__doc__)
        # but present elsewhere
        ok_('nproc' in sphere_searchlight.__doc__)
        ok_('nproc' in Searchlight.__init__.__doc__)

    # https://github.com/PyMVPA/PyMVPA/issues/106
    def test_searchlights_doc_qe(self):
        # queryengine should not be provided to sphere_* helpers
        for sl in (sphere_searchlight, sphere_gnbsearchlight,
                   sphere_m1nnsearchlight):
            for kw in ('queryengine', 'qe'):
                ok_(not kw in sl.__doc__,
                    msg='There should be no %r in %s.__doc__' % (kw, sl))

        # queryengine should be provided in corresponding classes __doc__s
        for sl in (Searchlight, GNBSearchlight, M1NNSearchlight):
            for kw in ('queryengine', ):
                ok_(kw in sl.__init__.__doc__,
                    msg='There should be %r in %s.__init__.__doc__' % (kw, sl))
            for kw in ('qe', ):
                ok_(not kw in sl.__init__.__doc__,
                    msg='There should be no %r in %s.__init__.__doc__' %
                    (kw, sl))

    #def _test_searchlights(self, ds, sls, roi_ids, result_all):  # pragma: no cover

    @sweepargs(
        lrn_sllrn_SL_partitioner=[
            (
                GNB(common_variance=v, descr='GNB'),
                None,
                sphere_gnbsearchlight,
                NFoldPartitioner(cvtype=1),
                0.  # correction for the error range
            ) for v in (True, False)
        ] +
        # Mean 1 NN searchlights
        [
            (ChainMapper(
                [mean_group_sample(['targets', 'partitions']),
                 kNN(1)],
                space='targets',
                descr='M1NN'), kNN(1), sphere_m1nnsearchlight,
             NFoldPartitioner(0.5, selection_strategy='random',
                              count=20), 0.05),
            # the same but with NFold(1) partitioner since it still should work
            (ChainMapper(
                [mean_group_sample(['targets', 'partitions']),
                 kNN(1)],
                space='targets',
                descr='NF-M1NN'), kNN(1), sphere_m1nnsearchlight,
             NFoldPartitioner(1), 0.05),
        ])
    @sweepargs(do_roi=(False, True))
    @sweepargs(results_backend=('native', 'hdf5'))
    @reseed_rng()
    def test_spatial_searchlight(self,
                                 lrn_sllrn_SL_partitioner,
                                 do_roi=False,
                                 results_backend='native'):
        """Tests both generic and ad-hoc searchlights (e.g. GNBSearchlight)
        Test of and adhoc searchlight anyways requires a ground-truth
        comparison to the generic version, so we are doing sweepargs here
        """
        lrn, sllrn, SL, partitioner, correction = lrn_sllrn_SL_partitioner
        ## if results_backend == 'hdf5' and not common_variance:
        ##     # no need for full combination of all possible arguments here
        ##     return

        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active \
           and  isinstance(lrn, ChainMapper):
            raise SkipTest("Known to fail while trying to enable "
                           "training_stats for the ChainMapper (M1NN here)")

        # e.g. for M1NN we need plain kNN(1) for m1nnsl, but to imitate m1nn
        #      "learner" we must use a chainmapper atm
        if sllrn is None:
            sllrn = lrn
        ds = datasets['3dsmall'].copy()
        # Let's test multiclass here, so boost # of labels
        ds[6:18].T += 2
        ds.fa['voxel_indices'] = ds.fa.myspace

        # To assure that users do not run into incorrect operation due to overflows
        ds.samples += 5000
        ds.samples *= 1000
        ds.samples = ds.samples.astype(np.int16)

        # compute N-1 cross-validation for each sphere
        # YOH: unfortunately sample_clf_lin is not guaranteed
        #      to provide exactly the same results due to inherent
        #      iterative process.  Therefore lets use something quick
        #      and pure Python
        cv = CrossValidation(lrn, partitioner)

        skwargs = dict(
            radius=1,
            enable_ca=['roi_sizes', 'raw_results', 'roi_feature_ids'])

        if do_roi:
            # select some random set of features
            nroi = rnd.randint(1, ds.nfeatures)
            # and lets compute the full one as well once again so we have a reference
            # which will be excluded itself from comparisons but values will be compared
            # for selected roi_id
            sl_all = SL(sllrn, partitioner, **skwargs)
            result_all = sl_all(ds)
            # select random features
            roi_ids = rnd.permutation(range(ds.nfeatures))[:nroi]
            skwargs['center_ids'] = roi_ids
        else:
            nroi = ds.nfeatures
            roi_ids = np.arange(nroi)
            result_all = None

        if results_backend == 'hdf5':
            skip_if_no_external('h5py')

        sls = [
            sphere_searchlight(cv, results_backend=results_backend, **skwargs),
            #GNBSearchlight(gnb, NFoldPartitioner(cvtype=1))
            SL(sllrn, partitioner, indexsum='fancy', **skwargs)
        ]

        if externals.exists('scipy'):
            sls += [SL(sllrn, partitioner, indexsum='sparse', **skwargs)]

        # Test nproc just once
        if externals.exists('pprocess') and not self._tested_pprocess:
            sls += [sphere_searchlight(cv, nproc=2, **skwargs)]
            self._tested_pprocess = True

        # Provide the dataset and all those searchlights for testing
        #self._test_searchlights(ds, sls, roi_ids, result_all)
        #nroi = len(roi_ids)
        #do_roi = nroi != ds.nfeatures
        all_results = []
        for sl in sls:
            # run searchlight
            mvpa2.seed()  # reseed rng again for m1nnsl
            results = sl(ds)
            all_results.append(results)
            #print `sl`
            # check for correct number of spheres
            self.assertTrue(results.nfeatures == nroi)
            # and measures (one per xfold)
            if partitioner.cvtype == 1:
                self.assertTrue(len(results) == len(ds.UC))
            elif partitioner.cvtype == 0.5:
                # here we had 4 unique chunks, so 6 combinations
                # even though 20 max was specified for NFold
                self.assertTrue(len(results) == 6)
            else:
                raise RuntimeError("Unknown yet type of partitioner to check")
            # check for chance-level performance across all spheres
            # makes sense only if number of features was big enough
            # to get some stable estimate of mean
            if not do_roi or nroi > 20:
                # correction here is for M1NN class which has wider distribution
                self.assertTrue(0.67 - correction < results.samples.mean() <
                                0.85 + correction,
                                msg="Out of range mean result: "
                                "lrn: %s  sllrn: %s  NROI: %d  MEAN: %.3f" % (
                                    lrn,
                                    sllrn,
                                    nroi,
                                    results.samples.mean(),
                                ))

            mean_errors = results.samples.mean(axis=0)
            # that we do get different errors ;)
            self.assertTrue(len(np.unique(mean_errors)) > 3)

            # check reasonable sphere sizes
            self.assertTrue(len(sl.ca.roi_sizes) == nroi)
            self.assertTrue(len(sl.ca.roi_feature_ids) == nroi)
            for i, fids in enumerate(sl.ca.roi_feature_ids):
                self.assertTrue(len(fids) == sl.ca.roi_sizes[i])
            if do_roi:
                # for roi we should relax conditions a bit
                self.assertTrue(max(sl.ca.roi_sizes) <= 7)
                self.assertTrue(min(sl.ca.roi_sizes) >= 4)
            else:
                self.assertTrue(max(sl.ca.roi_sizes) == 7)
                self.assertTrue(min(sl.ca.roi_sizes) == 4)

            # check base-class state
            self.assertEqual(sl.ca.raw_results.nfeatures, nroi)

            # Test if we got results correctly for 'selected' roi ids
            if do_roi:
                assert_array_equal(result_all[:, roi_ids], results)

        if len(all_results) > 1:
            # if we had multiple searchlights, we can check whether they all
            # gave the same result (they should have)
            aresults = np.array([a.samples for a in all_results])
            dresults = np.abs(aresults - aresults.mean(axis=0))
            dmax = np.max(dresults)
            self.assertTrue(dmax <= 1e-13)

        # Test the searchlight's reuse of neighbors
        for indexsum in ['fancy'] + (externals.exists('scipy') and ['sparse']
                                     or []):
            sl = SL(sllrn,
                    partitioner,
                    indexsum=indexsum,
                    reuse_neighbors=True,
                    **skwargs)
            mvpa2.seed()
            result1 = sl(ds)
            mvpa2.seed()
            result2 = sl(ds)  # must be faster
            assert_array_equal(result1, result2)

    def test_adhocsearchlight_perm_testing(self):
        # just a smoke test pretty much
        ds = datasets['3dmedium'].copy()
        #ds.samples += np.random.normal(size=ds.samples.shape)*10
        mvpa2.seed()
        ds.fa['voxel_indices'] = ds.fa.myspace
        from mvpa2.mappers.fx import mean_sample
        from mvpa2.clfs.stats import MCNullDist
        permutator = AttributePermutator('targets', count=8, limit='chunks')
        distr_est = MCNullDist(permutator,
                               tail='left',
                               enable_ca=['dist_samples'])
        slargs = (kNN(1),
                  NFoldPartitioner(0.5, selection_strategy='random', count=9))
        slkwargs = dict(radius=1, postproc=mean_sample())

        sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
        skip_if_no_external('scipy')  # needed for null_t
        sl = sphere_m1nnsearchlight(*slargs,
                                    null_dist=distr_est,
                                    enable_ca=['null_t'],
                                    reuse_neighbors=True,
                                    **slkwargs)
        mvpa2.seed()
        res_nodistr = sl_nodistr(ds)
        mvpa2.seed()
        res = sl(ds)
        # verify that we at least got the same main result
        # ah (yoh) -- null dist is estimated before the main
        # estimate so we can't guarantee correspondence :-/
        # assert_array_equal(res_nodistr, res)
        # only resemblance (TODO, may be we want to get/setstate
        # for rng before null_dist.fit?)

        # and dimensions correspond
        assert_array_equal(distr_est.ca.dist_samples.shape,
                           (1, ds.nfeatures, 8))
        assert_array_equal(sl.ca.null_t.samples.shape, (1, ds.nfeatures))

    def test_partial_searchlight_with_full_report(self):
        ds = self.dataset.copy()
        center_ids = np.zeros(ds.nfeatures, dtype='bool')
        center_ids[[3, 50]] = True
        ds.fa['center_ids'] = center_ids
        # compute N-1 cross-validation for each sphere
        cv = CrossValidation(GNB(), NFoldPartitioner())
        # construct diameter 1 (or just radius 0) searchlight
        # one time give center ids as a list, the other one takes it from the
        # dataset itself
        sls = (
            sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
            sphere_searchlight(None, radius=0, center_ids=[3, 50]),
            sphere_searchlight(cv, radius=0, center_ids='center_ids'),
        )
        for sl in sls:
            # assure that we could set cv post constructor
            if sl.datameasure is None:
                sl.datameasure = cv
            # run searchlight
            results = sl(ds)
            # only two spheres but error for all CV-folds
            self.assertEqual(results.shape, (len(self.dataset.UC), 2))
            # Test if results hold if we "set" a "new" datameasure
            sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
            results2 = sl(ds)
            assert_array_almost_equal(results, results2)

        # test if we gracefully puke if center_ids are out of bounds
        dataset0 = ds[:, :50]  # so we have no 50th feature
        self.assertRaises(IndexError, sls[0], dataset0)
        # but it should be fine on the one that gets the ids from the dataset
        # itself
        results = sl(dataset0)
        assert_equal(results.nfeatures, 1)
        # check whether roi_seeds are correct
        sl = sphere_searchlight(lambda x: np.vstack(
            (x.fa.roi_seed, x.samples)),
                                radius=1,
                                add_center_fa=True,
                                center_ids=[12])
        res = sl(ds)
        assert_array_equal(
            res.samples[1:, res.samples[0].astype('bool')].squeeze(),
            ds.samples[:, 12])

    def test_partial_searchlight_with_confusion_matrix(self):
        ds = self.dataset
        from mvpa2.clfs.stats import MCNullDist
        from mvpa2.mappers.fx import mean_sample, sum_sample

        # compute N-1 cross-validation for each sphere
        cm = ConfusionMatrix(labels=ds.UT)
        cv = CrossValidation(
            sample_clf_lin,
            NFoldPartitioner(),
            # we have to assure that matrix does not get flatted by
            # first vstack in cv and then hstack in searchlight --
            # thus 2 leading dimensions
            # TODO: RF? make searchlight/crossval smarter?
            errorfx=lambda *a: cm(*a)[None, None, :])
        # construct diameter 2 (or just radius 1) searchlight
        sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50])

        # our regular searchlight -- to compare results
        cv_gross = CrossValidation(sample_clf_lin, NFoldPartitioner())
        sl_gross = sphere_searchlight(cv_gross,
                                      radius=1,
                                      center_ids=[3, 5, 50])

        # run searchlights
        res = sl(ds)
        res_gross = sl_gross(ds)

        # only two spheres but error for all CV-folds and complete confusion matrix
        assert_equal(res.shape, (len(ds.UC), 3, len(ds.UT), len(ds.UT)))
        assert_equal(res_gross.shape, (len(ds.UC), 3))

        # briefly inspect the confusion matrices
        mat = res.samples
        # since input dataset is probably balanced (otherwise adjust
        # to be per label): sum within columns (thus axis=-2) should
        # be identical to per-class/chunk number of samples
        samples_per_classchunk = len(ds) / (len(ds.UT) * len(ds.UC))
        ok_(np.all(np.sum(mat, axis=-2) == samples_per_classchunk))
        # and if we compute accuracies manually -- they should
        # correspond to the one from sl_gross
        assert_array_almost_equal(
            res_gross.samples,
            # from accuracies to errors
            1 - (mat[..., 0, 0] + mat[..., 1, 1]).astype(float) /
            (2 * samples_per_classchunk))

        # and now for those who remained seated -- let's perform H0 MC
        # testing of this searchlight... just a silly one with minimal
        # number of permutations
        no_permutations = 10
        permutator = AttributePermutator('targets', count=no_permutations)

        # once again -- need explicit leading dimension to avoid
        # vstacking during cross-validation
        cv.postproc = lambda x: sum_sample()(x)[None, :]

        sl = sphere_searchlight(cv,
                                radius=1,
                                center_ids=[3, 5, 50],
                                null_dist=MCNullDist(
                                    permutator,
                                    tail='right',
                                    enable_ca=['dist_samples']))
        res_perm = sl(ds)
        # XXX all of the res_perm, sl.ca.null_prob and
        #     sl.null_dist.ca.dist_samples carry a degenerate leading
        #     dimension which was probably due to introduced new axis
        #     above within cv.postproc
        assert_equal(res_perm.shape, (1, 3, 2, 2))
        assert_equal(sl.null_dist.ca.dist_samples.shape,
                     res_perm.shape + (no_permutations, ))
        assert_equal(sl.ca.null_prob.shape, res_perm.shape)
        # just to make sure ;)
        ok_(np.all(sl.ca.null_prob.samples >= 0))
        ok_(np.all(sl.ca.null_prob.samples <= 1))

        # we should have got sums of hits across the splits
        assert_array_equal(np.sum(mat, axis=0), res_perm.samples[0])

    def test_chi_square_searchlight(self):
        # only do partial to save time

        # Can't yet do this since test_searchlight isn't yet "under nose"
        #skip_if_no_external('scipy')
        if not externals.exists('scipy'):
            return

        from mvpa2.misc.stats import chisquare

        cv = CrossValidation(sample_clf_lin,
                             NFoldPartitioner(),
                             enable_ca=['stats'])

        def getconfusion(data):
            cv(data)
            return chisquare(cv.ca.stats.matrix)[0]

        sl = sphere_searchlight(getconfusion, radius=0, center_ids=[3, 50])

        # run searchlight
        results = sl(self.dataset)
        self.assertTrue(results.nfeatures == 2)

    def test_1d_multispace_searchlight(self):
        ds = Dataset([np.arange(6)])
        ds.fa['coord1'] = np.repeat(np.arange(3), 2)
        # add a second space to the dataset
        ds.fa['coord2'] = np.tile(np.arange(2), 3)
        measure = lambda x: "+".join([str(x) for x in x.samples[0]])
        # simply select each feature once
        res = Searchlight(measure,
                          IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(0)),
                          nproc=1)(ds)
        assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
        res = Searchlight(measure,
                          IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(1)),
                          nproc=1)(ds)
        assert_array_equal(res.samples,
                           [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
        res = Searchlight(measure,
                          IndexQueryEngine(coord1=Sphere(1), coord2=Sphere(0)),
                          nproc=1)(ds)
        assert_array_equal(res.samples,
                           [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])

    #@sweepargs(regr=regrswh[:])
    @reseed_rng()
    def test_regression_with_additional_sa(self):
        regr = regrswh[:][0]
        ds = datasets['3dsmall'].copy()
        ds.fa['voxel_indices'] = ds.fa.myspace

        # Create a new sample attribute which will be used along with
        # every searchlight
        ds.sa['beh'] = np.random.normal(size=(ds.nsamples, 2))

        # and now for fun -- let's create custom linear regression
        # targets out of some random feature and beh linearly combined
        rfeature = np.random.randint(ds.nfeatures)
        ds.sa.targets = np.dot(
            np.hstack((ds.sa.beh, ds.samples[:, rfeature:rfeature + 1])),
            np.array([0.3, 0.2, 0.3]))

        class CrossValidationWithBeh(CrossValidation):
            """An adapter for regular CV which would hstack
               sa.beh to the searchlighting ds"""
            def _call(self, ds):
                return CrossValidation._call(
                    self, Dataset(np.hstack((ds, ds.sa.beh)), sa=ds.sa))

        cvbeh = CrossValidationWithBeh(regr,
                                       OddEvenPartitioner(),
                                       errorfx=corr_error)
        # regular cv
        cv = CrossValidation(regr, OddEvenPartitioner(), errorfx=corr_error)

        slbeh = sphere_searchlight(cvbeh, radius=1)
        slmapbeh = slbeh(ds)
        sl = sphere_searchlight(cv, radius=1)
        slmap = sl(ds)

        assert_equal(slmap.shape, (2, ds.nfeatures))
        # SL which had access to beh should have got for sure better
        # results especially in the vicinity of the chosen feature...
        features = sl.queryengine.query_byid(rfeature)
        assert_array_lequal(slmapbeh.samples[:, features],
                            slmap.samples[:, features])

        # elsewhere they should tend to be better but not guaranteed

    @labile(5, 1)
    def test_usecase_concordancesl(self):
        import numpy as np
        from mvpa2.base.dataset import vstack
        from mvpa2.mappers.fx import mean_sample

        # Take our sample 3d dataset
        ds1 = datasets['3dsmall'].copy(deep=True)
        ds1.fa['voxel_indices'] = ds1.fa.myspace
        ds1.sa['subject'] = [1
                             ]  # not really necessary -- but let's for clarity
        ds1 = mean_sample()(
            ds1)  # so we get just a single representative sample

        def corr12(ds):
            corr = np.corrcoef(ds.samples)
            assert (corr.shape == (2, 2))  # for paranoid ones
            return corr[0, 1]

        for nsc, thr, thr_mean in ((0, 1.0, 1.0),
                                   (0.1, 0.3, 0.8)):  # just a bit of noise
            ds2 = ds1.copy(deep=True)  # make a copy for the 2nd subject
            ds2.sa['subject'] = [2]
            ds2.samples += nsc * np.random.normal(size=ds1.shape)

            # make sure that both have the same voxel indices
            assert (np.all(ds1.fa.voxel_indices == ds2.fa.voxel_indices))
            ds_both = vstack((ds1, ds2))  # join 2 images into a single dataset
            # with .sa.subject distinguishing both

            sl = sphere_searchlight(corr12, radius=2)
            slmap = sl(ds_both)
            ok_(np.all(slmap.samples >= thr))
            ok_(np.mean(slmap.samples) >= thr)

    def test_swaroop_case(self):
        """Test hdf5 backend to pass results on Swaroop's usecase
        """
        skip_if_no_external('h5py')
        from mvpa2.measures.base import Measure

        class sw_measure(Measure):
            def __init__(self):
                Measure.__init__(self, auto_train=True)

            def _call(self, dataset):
                # For performance measures -- increase to 50-200
                # np.sum here is just to get some meaningful value in
                # them
                #return np.ones(shape=(2, 2))*np.sum(dataset)
                return Dataset(
                    np.array([{
                        'd': np.ones(shape=(5, 5)) * np.sum(dataset)
                    }],
                             dtype=object))

        results = []
        ds = datasets['3dsmall'].copy(deep=True)
        ds.fa['voxel_indices'] = ds.fa.myspace

        our_custom_prefix = tempfile.mktemp()
        for backend in ['native'] + \
                (externals.exists('h5py') and ['hdf5'] or []):
            sl = sphere_searchlight(sw_measure(),
                                    radius=1,
                                    tmp_prefix=our_custom_prefix,
                                    results_backend=backend)
            t0 = time.time()
            results.append(np.asanyarray(sl(ds)))
            # print "Done for backend %s in %d sec" % (backend, time.time() - t0)
        # because of swaroop's ad-hoc (who only could recommend such
        # a construct?) use case, and absent fancy working assert_objectarray_equal
        # let's compare manually
        #assert_objectarray_equal(*results)
        if not externals.exists('h5py'):
            self.assertRaises(RuntimeError,
                              sphere_searchlight,
                              sw_measure(),
                              results_backend='hdf5')
            raise SkipTest('h5py required for test of backend="hdf5"')
        assert_equal(results[0].shape, results[1].shape)
        results = [r.flatten() for r in results]
        for x, y in zip(*results):
            assert_equal(x.keys(), y.keys())
            assert_array_equal(x['d'], y['d'])
        # verify that no junk is left behind
        tempfiles = glob.glob(our_custom_prefix + '*')
        assert_equal(len(tempfiles), 0)

    def test_nblocks(self):
        skip_if_no_external('pprocess')
        # just a basic test to see that we are getting the same
        # results with different nblocks
        ds = datasets['3dsmall'].copy(deep=True)[:, :13]
        ds.fa['voxel_indices'] = ds.fa.myspace
        cv = CrossValidation(GNB(), OddEvenPartitioner())
        res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
        res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
        assert_array_equal(res1, res2)

    def test_custom_results_fx_logic(self):
        # results_fx was introduced for the blow-up-the-memory-Swaroop
        # where keeping all intermediate results of the dark-magic SL
        # hyperalignment is not feasible.  So it is desired to split
        # searchlight computation in more blocks while composing the
        # target result "on-the-fly" from available so far results.
        #
        # Implementation relies on using generators feeding the
        # results_fx with fresh results whenever those become
        # available.
        #
        # This test/example's "measure" creates files which should be
        # handled by the results_fx function and removed in this case
        # to check if we indeed have desired high number of blocks while
        # only limited nproc.
        skip_if_no_external('pprocess')

        tfile = tempfile.mktemp('mvpa', 'test-sl')

        ds = datasets['3dsmall'].copy()[:, :25]  # smaller copy
        ds.fa['voxel_indices'] = ds.fa.myspace
        ds.fa['feature_id'] = np.arange(ds.nfeatures)

        nproc = 3  # it is not about computing -- so we can
        # start more processes than there are CPUs just to test
        nblocks = nproc * 7
        # figure out max number of features to be given to any proc_block
        # yoh: not sure why I had to +1 here... but now it became more robust and
        # still seems to be doing what was demanded so be it
        max_block = int(ceil(ds.nfeatures / float(nblocks)) + 1)

        def print_(s, *args):
            """For local debugging"""
            #print s, args
            pass

        def results_fx(sl=None, dataset=None, roi_ids=None, results=None):
            """It will "process" the results by removing those files
               generated inside the measure
            """
            res = []
            print_("READY")
            for x in results:
                ok_(isinstance(x, list))
                res.append(x)
                print_("R: ", x)
                for r in x:
                    # Can happen if we requested those .ca's enabled
                    # -- then automagically _proc_block would wrap
                    # results in a dataset... Originally detected by
                    # running with MVPA_DEBUG=.* which triggered
                    # enabling all ca's
                    if is_datasetlike(r):
                        r = np.asscalar(r.samples)
                    os.unlink(r)  # remove generated file
                print_("WAITING")

            results_ds = hstack(sum(res, []))

            # store the center ids as a feature attribute since we use
            # them for testing
            results_ds.fa['center_ids'] = roi_ids
            return results_ds

        def results_postproc_fx(results):
            for ds in results:
                ds.fa['test_postproc'] = np.atleast_1d(ds.a.roi_center_ids**2)
            return results

        def measure(ds):
            """The "measure" will check if a run with the same "index" from
               previous block has been processed by now
            """
            f = '%s+%03d' % (tfile, ds.fa.feature_id[0] % (max_block * nproc))
            print_("FID:%d f:%s" % (ds.fa.feature_id[0], f))

            # allow for up to few seconds to wait for the file to
            # disappear -- i.e. its result from previous "block" was
            # processed
            t0 = time.time()
            while os.path.exists(f) and time.time() - t0 < 4.:
                time.sleep(0.5)  # so it does take time to compute the measure
                pass
            if os.path.exists(f):
                print_("ERROR: ", f)
                raise AssertionError(
                    "File %s must have been processed by now" % f)
            open(f, 'w').write(
                'XXX')  # signal that we have computed this measure
            print_("RES: %s" % f)
            return f

        sl = sphere_searchlight(measure,
                                radius=0,
                                nproc=nproc,
                                nblocks=nblocks,
                                results_postproc_fx=results_postproc_fx,
                                results_fx=results_fx,
                                center_ids=np.arange(ds.nfeatures))

        assert_equal(len(glob.glob(tfile + '*')), 0)  # so no junk around
        try:
            res = sl(ds)
            assert_equal(res.nfeatures, ds.nfeatures)
            # verify that we did have results_postproc_fx called
            assert_array_equal(res.fa.test_postproc,
                               np.power(res.fa.center_ids, 2))
        finally:
            # remove those generated left-over files
            for f in glob.glob(tfile + '*'):
                os.unlink(f)
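For reference, a minimal, hedged searchlight sketch distilled from the tests above; the location of the toy datasets (mvpa2.testing.datasets) is an assumption.

from mvpa2.clfs.gnb import GNB
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets   # assumed location of the toy datasets used above

ds = datasets['3dsmall'].copy()
ds.fa['voxel_indices'] = ds.fa.myspace      # the searchlight expects this feature attribute
cv = CrossValidation(GNB(), NFoldPartitioner())
sl = sphere_searchlight(cv, radius=1, enable_ca=['roi_sizes'])
err_map = sl(ds)                            # one sample per fold, one feature per sphere center
mean_err = err_map.samples.mean(axis=0)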
Exemple #34
0
def get_dsm_roi_xval1_firstlev(ds,
                               rois,
                               zscore_ds=True,
                               part=OddEvenPartitioner(),
                               cond_chunk='condition',
                               fisher=False):
    """ Obtain second-order dissimilarities between ROIs. This version
    cross-validates at the first level and returns only the first level,
    without distances between ROIs

    Arguments
    --------
    ds: dataset
    rois: dict
        each item in the dictionary must be a tuple where the 0th element is
        the center of the roi, and the 1st element is a list of ids
    zscore_ds: bool
        whether to z-score each partition half (chunk-wise) within a fold before
        computing dissimilarities (applied only when it has at least 4 samples)
    part: partitioner
        partitioner used to generate the cross-validation folds
    cond_chunk: str
        sample attribute over which samples are averaged (mean_group_sample)
    fisher: bool
        whether to fisher-transform the correlations before averaging across folds

    Returns
    -------
    dataset containing first level dsm of shape (nrois, ncond**2)
    """
    #ds = h5load(fns.betafn(subnr))
    #ds = ds[:, mask_]
    #ds = ds[ds.sa.condition != 'self']

    # set up oddeven partition
    #part = OddEvenPartitioner()

    mgs = mean_group_sample([cond_chunk])
    dissims_folds = []
    folds = 1
    for ds_ in part.generate(ds):
        print("Running fold {0}".format(folds))
        ds_1 = ds_[ds_.sa.partitions == 1]
        ds_2 = ds_[ds_.sa.partitions == 2]

        ds_1 = mgs(ds_1)
        ds_2 = mgs(ds_2)
        if ds_1.nsamples >= 4 and zscore_ds:
            zscore(ds_1, chunks_attr='chunks')
            zscore(ds_2, chunks_attr='chunks')
        assert (ds_1.samples.shape == ds_2.samples.shape)

        # first generate first-order rdms cross-validated across folds
        names = []
        centers = []
        dissims = []
        for roi, (center, ids) in rois.iteritems():
            names.append(roi)
            centers.append(center)

            sample1_roi = ds_1.samples[:, ids]
            sample2_roi = ds_2.samples[:, ids]

            dissim_roi = corrcoefxy(sample1_roi.T,
                                    sample2_roi.T,
                                    fisher=fisher)
            nsamples = ds_1.nsamples
            assert (dissim_roi.shape == (nsamples, nsamples))

            dissims.append(
                dissim_roi.flatten())  # now the RDM is not symmetrical anymore
        dissims_folds.append(np.array(dissims))
        folds += 1

    # average across folds
    dissims_folds = np.array(dissims_folds).mean(axis=0)
    assert (dissims_folds.shape == (len(names), nsamples**2))

    if fisher:
        dissims_folds = np.tanh(dissims_folds)

    dissims_folds = dataset_wizard(dissims_folds, targets=names)
    dissims_folds.sa['centers'] = centers

    return dissims_folds
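A hedged usage sketch for get_dsm_roi_xval1_firstlev with Fisher averaging; beta_ds and rois are hypothetical inputs as in the earlier sketch.

firstlev = get_dsm_roi_xval1_firstlev(beta_ds, rois, zscore_ds=True,
                                      cond_chunk='condition', fisher=True)
# with fisher=True, fold-wise correlations are averaged in z-space and mapped
# back through tanh, which reduces the bias of averaging bounded r values
print(firstlev.shape)        # (n_rois, n_conditions ** 2)
print(firstlev.sa.centers)   # ROI centers carried along as a sample attribute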
Exemple #35
0
def group_sample_loser_measure(attrs=('targets', )):
    '''takes the loser after averaging over attrs'''
    return ChainNode((mean_group_sample(attrs), sample_loser_measure()))
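The same chaining pattern using only stock PyMVPA nodes, as a hedged illustration of how ChainNode applies its children in sequence (sample_loser_measure itself is defined elsewhere in this codebase).

from mvpa2.base.node import ChainNode
from mvpa2.mappers.fx import mean_group_sample, mean_sample

# first average samples within each target, then collapse to a single mean sample
node = ChainNode((mean_group_sample(['targets']), mean_sample()))
res = node(ds)   # ds is a hypothetical dataset with sa.targets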
Exemple #36
0
        roi_neighborhood=Sphere(6),
        nruns=3, nsubjects=2,
        noise_subject_n=1, noise_subject_std=5, noise_subject_smooth=5,
        noise_independent_std=4, noise_independent_smooth=1.5,
        noise_common_n=1, noise_common_std=3)

    # just a little helper
    def get2d(ds):
        return dss[0].a.mapper.reverse(ds)

    import pylab as pl
    pl.clf()
    DS = dsvstack(dss)
    # Sample plots
    for s in [0, 1]:
        ds2 = get2d(dss[s])
        for r in [0, 1]:
            pl.subplot(3, 3, 1 + r + s * 3)
            pl.imshow(ds2[ds2.sa.chunks == r].samples[0], interpolation='nearest')
            pl.ylabel('subj%d' % s)
            pl.xlabel('run%d' % r)
        pl.subplot(3, 3, 3 + s * 3)
        pl.imshow(get2d(mean_group_sample(['dissimilarity'])(dss[s]).samples)[0], interpolation='nearest')
        pl.xlabel('mean')

    ds = dsvstack(dss)
    ds.a['mapper'] = dss[0].a.mapper
    ds_mean = mean_group_sample(['dissimilarity', 'chunks'])(ds)
    for r in [0, 1]:
        ds_mean_run0 = ds.a.mapper.reverse(ds_mean[ds_mean.chunks == r])
        pl.subplot(3, 3, 1 + r + 2 * 3)
        pl.imshow(ds_mean_run0.samples[0], interpolation='nearest')
        pl.ylabel('mean(subj)')
        pl.xlabel('run%d' % r)
    ds_global_mean = mean_group_sample(['dissimilarity'])(ds)
    pl.subplot(3, 3, 3 + 2 * 3)
    pl.imshow(get2d(ds_global_mean).samples[0], interpolation='nearest')
    pl.xlabel('mean')

pl.show()