Example #1
0
def test_scattered_neighborhoods():
    radius = 1
    sphere = ne.Sphere(radius)
    coords = range(50)

    scoords, sidx = ne.scatter_neighborhoods(sphere,
                                             coords,
                                             deterministic=False)
    # for this specific case of 1d coordinates the coords and idx should be
    # identical
    assert_array_equal(scoords, sidx)
    # minimal difference of successive coordinates must be larger than the
    # radius of the spheres. Test only works for 1d coords and sorted return
    # values
    assert (np.diff(scoords).min() > radius)

    # now the same for the case where a particular coordinate appears multiple
    # times
    coords = range(10) + range(10)
    scoords, sidx = ne.scatter_neighborhoods(sphere,
                                             coords,
                                             deterministic=False)
    sidx = sorted(sidx)
    assert_array_equal(scoords, sidx[:5])
    assert_array_equal(scoords, [i - 10 for i in sidx[5:]])
Example #2
0
def test_scattered_neighborhoods():
    radius = 1
    sphere = ne.Sphere(radius)
    coords = range(50)

    scoords, sidx = ne.scatter_neighborhoods(sphere, coords,
                                             deterministic=False)
    # for this specific case of 1d coordinates the coords and idx should be
    # identical
    assert_array_equal(scoords, sidx)
    # minimal difference of successive coordinates must be larger than the
    # radius of the spheres. Test only works for 1d coords and sorted return
    # values
    assert(np.diff(scoords).min() > radius)

    # now the same for the case where a particular coordinate appears multiple
    # times
    coords = range(10) + range(10)
    scoords, sidx = ne.scatter_neighborhoods(sphere, coords,
                                             deterministic=False)
    sidx = sorted(sidx)
    assert_array_equal(scoords, sidx[:5])
    assert_array_equal(scoords, [i - 10 for i in sidx[5:]])
 def test_searchlight_hyperalignment(self):
     skip_if_no_external('scipy')
     skip_if_no_external('h5py')
     ds_orig = datasets['3dsmall'].copy()[:, :15]
     ds_orig.fa['voxel_indices'] = ds_orig.fa.myspace
     space = 'voxel_indices'
     # total number of datasets for the analysis
     nds = 5
     zscore(ds_orig, chunks_attr=None)
     dss = [ds_orig]
     # create a few distorted datasets to match the desired number of datasets
     # not sure if this truly mimics the real data, but at least we can test
     # implementation
     while len(dss) < nds - 1:
         sd = local_random_affine_transformations(
             ds_orig,
             scatter_neighborhoods(
                 Sphere(1),
                 ds_orig.fa[space].value, deterministic=True)[1],
             Sphere(2),
             space=space,
             scale_fac=1.0, shift_fac=0.0)
         # sometimes above function returns dataset with nans, infs, we don't want that.
         if np.sum(np.isnan(sd.samples)+np.isinf(sd.samples)) == 0 \
                 and np.all(sd.samples.std(0)):
             dss.append(sd)
     ds_orig_noisy = ds_orig.copy()
     ds_orig_noisy.samples += 0.1 * np.random.random(size=ds_orig_noisy.shape)
     dss.append(ds_orig_noisy)
     _ = [zscore(sd, chunks_attr=None) for sd in dss[1:]]
     # we should have some distortion
     for ds in dss[1:]:
         assert_false(np.all(ds_orig.samples == ds.samples))
     # testing checks
     slhyp = SearchlightHyperalignment(ref_ds=1, exclude_from_model=[1])
     self.assertRaises(ValueError, slhyp, dss[:3])
     slhyp = SearchlightHyperalignment(ref_ds=3)
     self.assertRaises(ValueError, slhyp, dss[:3])
     # store projections for each mapper separately
     projs = list()
     # run the algorithm with all combinations of the two major parameters
     # for projection calculation.
     for kwargs in [{'combine_neighbormappers': True, 'nproc': 2},
                    {'combine_neighbormappers': True, 'dtype': 'float64', 'compute_recon': True},
                    {'combine_neighbormappers': True, 'exclude_from_model': [2, 4]},
                    {'combine_neighbormappers': False},
                    {'combine_neighbormappers': False, 'mask_node_ids': np.arange(dss[0].nfeatures).tolist()},
                    {'combine_neighbormappers': True, 'sparse_radius': 1},
                    {'combine_neighbormappers': True, 'nblocks': 2}]:
         slhyp = SearchlightHyperalignment(radius=2, **kwargs)
         mappers = slhyp(dss)
         # one mapper per input ds
         assert_equal(len(mappers), nds)
         projs.append(mappers)
     # some checks
     for midx in range(nds):
         # making sure mask_node_ids options works as expected
         assert_array_almost_equal(projs[3][midx].proj.todense(),
                                   projs[4][midx].proj.todense())
         # recon check
         assert_array_almost_equal(projs[0][midx].proj.todense(),
                                   projs[1][midx].recon.T.todense(), decimal=5)
         assert_equal(projs[1][midx].proj.dtype, 'float64')
         assert_equal(projs[0][midx].proj.dtype, 'float32')
     # making sure the projections make sense
     for proj in projs:
         # no .max on sparse matrices on older scipy (e.g. on precise) so conver to array first
         max_weight = proj[0].proj.toarray().max(0).squeeze()
         diag_weight = proj[0].proj.diagonal()
         # Check to make sure diagonal is the max weight, in almost all rows for reference subject
         assert(np.sum(max_weight == diag_weight) / float(len(diag_weight)) > 0.90)
         # and not true for other subjects
         for i in range(1, nds - 1):
             assert(np.sum(proj[i].proj.toarray().max(0).squeeze() == proj[i].proj.diagonal())
                    / float(proj[i].proj.shape[0]) < 0.80)
         # Check to make sure projection weights match across duplicate datasets
         max_weight = proj[-1].proj.toarray().max(0).squeeze()
         diag_weight = proj[-1].proj.diagonal()
         # Check to make sure diagonal is the max weight, in almost all rows for reference subject
         assert(np.sum(max_weight == diag_weight) / float(len(diag_weight)) > 0.90)
     # project data
     dss_hyper = [hm.forward(sd) for hm, sd in zip(projs[0], dss)]
     _ = [zscore(sd, chunks_attr=None) for sd in dss_hyper]
     ndcss = []
     nf = ds_orig.nfeatures
     for ds_hyper in dss_hyper:
         ndcs = np.diag(np.corrcoef(ds_hyper.samples.T,
                                    ds_orig.samples.T)[nf:, :nf],
                        k=0)
         ndcss += [ndcs]
     assert_true(np.median(ndcss[0]) > 0.9)
     # noisy copy of original dataset should be similar to original after hyperalignment
     assert_true(np.median(ndcss[-1]) > 0.9)
     assert_true(np.all([np.median(ndcs) > 0.2 for ndcs in ndcss[1:-2]]))
    def __call__(self, datasets):
        """Estimate mappers for each dataset using searchlight-based
        hyperalignment.

        Parameters
        ----------
          datasets : list or tuple of datasets

        Returns
        -------
        A list of trained StaticProjectionMappers of the same length as datasets
        """

        # Perform some checks first before modifying internal state
        params = self.params
        ndatasets = len(datasets)

        if len(datasets) <= 1:
            raise ValueError("SearchlightHyperalignment needs > 1 dataset to "
                             "operate on. Got: %d" % self.ndatasets)

        if params.ref_ds in params.exclude_from_model:
            raise ValueError("Requested reference dataset %i is also "
                             "in the exclude list." % params.ref_ds)

        if params.ref_ds >= ndatasets:
            raise ValueError("Requested reference dataset %i is out of "
                             "bounds. We have only %i datasets provided" %
                             (params.ref_ds, self.ndatasets))

        # The rest of the checks are just warnings
        self.ndatasets = ndatasets

        _shpaldebug("SearchlightHyperalignment %s for %i datasets" %
                    (self, self.ndatasets))

        selected = [
            _ for _ in range(ndatasets) if _ not in params.exclude_from_model
        ]
        ref_ds_train = selected.index(params.ref_ds)
        params.hyperalignment.params.ref_ds = ref_ds_train
        warning('Using %dth dataset as the reference dataset (%dth after '
                'excluding datasets)' % (params.ref_ds, ref_ds_train))
        if len(params.exclude_from_model) > 0:
            warning("These datasets will not participate in building common "
                    "model: %s" % params.exclude_from_model)

        if __debug__:
            # verify that datasets were zscored prior the alignment since it is
            # assumed/required preprocessing step
            for ids, ds in enumerate(datasets):
                for f, fname, tval in ((np.mean, 'means', 0), (np.std, 'stds',
                                                               1)):
                    vals = f(ds, axis=0)
                    vals_comp = np.abs(vals - tval) > 1e-5
                    if np.any(vals_comp):
                        warning(
                            '%d %s are too different (max diff=%g) from %d in '
                            'dataset %d to come from a zscored dataset. '
                            'Please zscore datasets first for correct operation '
                            '(unless if was intentional)' %
                            (np.sum(vals_comp), fname, np.max(
                                np.abs(vals)), tval, ids))

        # Setting up SearchlightHyperalignment
        # we need to know which original features where comprising the
        # individual SL ROIs
        _shpaldebug('Initializing FeatureSelectionHyperalignment.')
        hmeasure = FeatureSelectionHyperalignment(
            ref_ds=params.ref_ds,
            featsel=params.featsel,
            hyperalignment=params.hyperalignment,
            full_matrix=params.combine_neighbormappers,
            use_same_features=params.use_same_features,
            exclude_from_model=params.exclude_from_model,
            dtype=params.dtype)

        # Performing SL processing manually
        _shpaldebug("Setting up for searchlights")
        if params.nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                params.nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1" %
                        externals.versions['pprocess'])
                params.nproc = 1

        # XXX I think this class should already accept a single dataset only.
        # It should have a ``space`` setting that names a sample attribute that
        # can be used to identify individual/original datasets.
        # Taking a single dataset as argument would be cleaner, because the
        # algorithm relies on the assumption that there is a coarse feature
        # alignment, i.e. the SL ROIs cover roughly the same area
        queryengines = self._get_trained_queryengines(datasets,
                                                      params.queryengine,
                                                      params.radius,
                                                      params.ref_ds)
        # For surface nodes to voxels queryengines, roi_seed hardly makes sense
        qe = queryengines[(0 if len(queryengines) == 1 else params.ref_ds)]
        if isinstance(qe, SurfaceVerticesQueryEngine):
            self.force_roi_seed = False
            if not self.params.combine_neighbormappers:
                raise NotImplementedError(
                    "Mapping from voxels to surface nodes is not "
                    "implmented yet. Try setting combine_neighbormappers to True."
                )
        self.nfeatures = datasets[params.ref_ds].nfeatures
        _shpaldebug("Performing Hyperalignment in searchlights")
        # Setting up centers for running SL Hyperalignment
        if params.sparse_radius is None:
            roi_ids = self._get_verified_ids(queryengines) \
                if params.mask_node_ids is None \
                else params.mask_node_ids
        else:
            if params.queryengine is not None:
                raise NotImplementedError(
                    "using sparse_radius whenever custom queryengine is "
                    "provided is not yet supported.")
            _shpaldebug("Setting up sparse neighborhood")
            from mvpa2.misc.neighborhood import scatter_neighborhoods
            if params.mask_node_ids is None:
                scoords, sidx = scatter_neighborhoods(
                    Sphere(params.sparse_radius),
                    datasets[params.ref_ds].fa.voxel_indices,
                    deterministic=True)
                roi_ids = sidx
            else:
                scoords, sidx = scatter_neighborhoods(
                    Sphere(params.sparse_radius),
                    datasets[params.ref_ds].fa.voxel_indices[
                        params.mask_node_ids],
                    deterministic=True)
                roi_ids = [params.mask_node_ids[sid] for sid in sidx]

        # Initialize projections
        _shpaldebug('Initializing projection matrices')
        self.projections = [
            csc_matrix((self.nfeatures, self.nfeatures), dtype=params.dtype)
            for isub in range(self.ndatasets)
        ]

        # compute
        if params.nproc is not None and params.nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            nproc_needed = min(len(roi_ids), params.nproc)
            params.nblocks = nproc_needed \
                if params.nblocks is None else params.nblocks
            params.nblocks = min(len(roi_ids), params.nblocks)
            node_blocks = np.array_split(roi_ids, params.nblocks)
            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc_needed)
            if __debug__:
                debug(
                    'SLC', "Starting off %s child processes for nblocks=%i" %
                    (nproc_needed, params.nblocks))
            compute = p_results.manage(pprocess.MakeParallel(self._proc_block))
            seed = mvpa2.get_random_seed()
            for iblock, block in enumerate(node_blocks):
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                compute(block,
                        datasets,
                        copy.copy(hmeasure),
                        queryengines,
                        seed=seed,
                        iblock=iblock)
        else:
            # otherwise collect the results in an 1-item list
            _shpaldebug('Using 1 process to compute mappers.')
            if params.nblocks is None:
                params.nblocks = 1
            params.nblocks = min(len(roi_ids), params.nblocks)
            node_blocks = np.array_split(roi_ids, params.nblocks)
            p_results = [
                self._proc_block(block, datasets, hmeasure, queryengines)
                for block in node_blocks
            ]
        results_ds = self.__handle_all_results(p_results)
        # Dummy iterator for, you know, iteration
        list(results_ds)

        _shpaldebug(
            'Wrapping projection matrices into StaticProjectionMappers')
        self.projections = [
            StaticProjectionMapper(proj=proj, recon=proj.T)
            if params.compute_recon else StaticProjectionMapper(proj=proj)
            for proj in self.projections
        ]
        return self.projections
Example #5
0
def run(args):
    if os.path.isfile(args.payload) and args.payload.endswith('.py'):
        measure = script2obj(args.payload)
    elif args.payload == 'cv':
        if args.cv_learner is None or args.cv_partitioner is None:
            raise ValueError('cross-validation payload requires --learner and --partitioner')
        # get CV instance
        measure = get_crossvalidation_instance(
                    args.cv_learner, args.cv_partitioner, args.cv_errorfx,
                    args.cv_sampling_repetitions, args.cv_learner_space,
                    args.cv_balance_training, args.cv_permutations,
                    args.cv_avg_datafold_results, args.cv_prob_tail)
    else:
        raise RuntimeError("this should not happen")
    ds = arg2ds(args.data)
    if args.ds_preproc_fx is not None:
        ds = args.ds_preproc_fx(ds)
    # setup neighborhood
    # XXX add big switch to allow for setting up surface-based neighborhoods
    from mvpa2.misc.neighborhood import IndexQueryEngine
    qe = IndexQueryEngine(**dict(args.neighbors))
    # determine ROIs
    rids = None     # all by default
    aggregate_fx = args.aggregate_fx
    if args.roi_attr is not None:
        # first figure out which roi features should be processed
        if len(args.roi_attr) == 1 and args.roi_attr[0] in ds.fa.keys():
            # name of an attribute -> pull non-zeroes
            rids = ds.fa[args.roi_attr[0]].value.nonzero()[0]
        else:
            # an expression?
            from .cmd_select import _eval_attr_expr
            rids = _eval_attr_expr(args.roi_attr, ds.fa).nonzero()[0]

    seed_ids = None
    if args.scatter_rois is not None:
        # scatter_neighborhoods among available ids if was requested
        from mvpa2.misc.neighborhood import scatter_neighborhoods
        attr, nb = args.scatter_rois
        coords = ds.fa[attr].value
        if rids is not None:
            # select only those which were chosen by ROI
            coords = coords[rids]
        _, seed_ids = scatter_neighborhoods(nb, coords)
        if aggregate_fx is None:
            # no custom one given -> use default "fill in" function
            aggregate_fx = _fill_in_scattered_results
            if args.enable_ca is None:
                args.enable_ca = ['roi_feature_ids']
            elif 'roi_feature_ids' not in args.enable_ca:
                args.enable_ca += ['roi_feature_ids']

    if seed_ids is None:
        roi_ids = rids
    else:
        if rids is not None:
            # we had to sub-select by scatterring among available rids
            # so we would need to get original ids
            roi_ids = rids[seed_ids]
        else:
            # scattering happened on entire feature-set
            roi_ids = seed_ids

    verbose(3, 'Attempting %i ROI analyses'
               % ((roi_ids is None) and ds.nfeatures or len(roi_ids)))

    from mvpa2.measures.searchlight import Searchlight

    sl = Searchlight(measure,
                     queryengine=qe,
                     roi_ids=roi_ids,
                     nproc=args.nproc,
                     results_backend=args.multiproc_backend,
                     results_fx=aggregate_fx,
                     enable_ca=args.enable_ca,
                     disable_ca=args.disable_ca)
    # XXX support me too!
    #                 add_center_fa
    #                 tmp_prefix
    #                 nblocks
    #                 null_dist
    # run
    res = sl(ds)
    if (seed_ids is not None) and ('mapper' in res.a):
        # strip the last mapper link in the chain, which would be the seed ID selection
        res.a['mapper'] = res.a.mapper[:-1]
    # XXX create more output
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    return res
Example #6
0
def run(args):
    if os.path.isfile(args.payload) and args.payload.endswith('.py'):
        measure = script2obj(args.payload)
    elif args.payload == 'cv':
        if args.cv_learner is None or args.cv_partitioner is None:
            raise ValueError(
                'cross-validation payload requires --learner and --partitioner'
            )
        # get CV instance
        measure = get_crossvalidation_instance(
            args.cv_learner, args.cv_partitioner, args.cv_errorfx,
            args.cv_sampling_repetitions, args.cv_learner_space,
            args.cv_balance_training, args.cv_permutations,
            args.cv_avg_datafold_results, args.cv_prob_tail)
    else:
        raise RuntimeError("this should not happen")
    ds = arg2ds(args.data)
    if not args.ds_preproc_fx is None:
        ds = args.ds_preproc_fx(ds)
    # setup neighborhood
    # XXX add big switch to allow for setting up surface-based neighborhoods
    from mvpa2.misc.neighborhood import IndexQueryEngine
    qe = IndexQueryEngine(**dict(args.neighbors))
    # determine ROIs
    rids = None  # all by default
    aggregate_fx = args.aggregate_fx
    if args.roi_attr is not None:
        # first figure out which roi features should be processed
        if len(args.roi_attr) == 1 and args.roi_attr[0] in ds.fa.keys():
            # name of an attribute -> pull non-zeroes
            rids = ds.fa[args.roi_attr[0]].value.nonzero()[0]
        else:
            # an expression?
            from .cmd_select import _eval_attr_expr
            rids = _eval_attr_expr(args.roi_attr, ds.fa).nonzero()[0]

    seed_ids = None
    if args.scatter_rois is not None:
        # scatter_neighborhoods among available ids if was requested
        from mvpa2.misc.neighborhood import scatter_neighborhoods
        attr, nb = args.scatter_rois
        coords = ds.fa[attr].value
        if rids is not None:
            # select only those which were chosen by ROI
            coords = coords[rids]
        _, seed_ids = scatter_neighborhoods(nb, coords)
        if aggregate_fx is None:
            # no custom one given -> use default "fill in" function
            aggregate_fx = _fill_in_scattered_results
            if args.enable_ca is None:
                args.enable_ca = ['roi_feature_ids']
            elif 'roi_feature_ids' not in args.enable_ca:
                args.enable_ca += ['roi_feature_ids']

    if seed_ids is None:
        roi_ids = rids
    else:
        if rids is not None:
            # we had to sub-select by scatterring among available rids
            # so we would need to get original ids
            roi_ids = rids[seed_ids]
        else:
            # scattering happened on entire feature-set
            roi_ids = seed_ids

    verbose(
        3, 'Attempting %i ROI analyses' %
        ((roi_ids is None) and ds.nfeatures or len(roi_ids)))

    from mvpa2.measures.searchlight import Searchlight

    sl = Searchlight(measure,
                     queryengine=qe,
                     roi_ids=roi_ids,
                     nproc=args.nproc,
                     results_backend=args.multiproc_backend,
                     results_fx=aggregate_fx,
                     enable_ca=args.enable_ca,
                     disable_ca=args.disable_ca)
    # XXX support me too!
    #                 add_center_fa
    #                 tmp_prefix
    #                 nblocks
    #                 null_dist
    # run
    res = sl(ds)
    if (seed_ids is not None) and ('mapper' in res.a):
        # strip the last mapper link in the chain, which would be the seed ID selection
        res.a['mapper'] = res.a.mapper[:-1]
    # XXX create more output
    # and store
    ds2hdf5(res, args.output, compression=args.hdf5_compression)
    return res
    def __call__(self, datasets):
        """Estimate mappers for each dataset using searchlight-based
        hyperalignment.

        Parameters
        ----------
          datasets : list or tuple of datasets

        Returns
        -------
        A list of trained StaticProjectionMappers of the same length as datasets
        """

        # Perform some checks first before modifying internal state
        params = self.params
        ndatasets = len(datasets)

        if len(datasets) <= 1:
            raise ValueError("SearchlightHyperalignment needs > 1 dataset to "
                             "operate on. Got: %d" % self.ndatasets)

        if params.ref_ds in params.exclude_from_model:
            raise ValueError("Requested reference dataset %i is also "
                             "in the exclude list." % params.ref_ds)

        if params.ref_ds >= ndatasets:
            raise ValueError("Requested reference dataset %i is out of "
                             "bounds. We have only %i datasets provided"
                             % (params.ref_ds, self.ndatasets))

        # The rest of the checks are just warnings
        self.ndatasets = ndatasets

        _shpaldebug("SearchlightHyperalignment %s for %i datasets"
                    % (self, self.ndatasets))

        if params.ref_ds != params.hyperalignment.params.ref_ds:
            warning('Supplied ref_ds & hyperalignment instance ref_ds:%d differ.'
                    % params.hyperalignment.params.ref_ds)
            warning('Using default hyperalignment instance with ref_ds: %d' % params.ref_ds)
            params.hyperalignment = Hyperalignment(ref_ds=params.ref_ds)
        if len(params.exclude_from_model) > 0:
            warning("These datasets will not participate in building common "
                    "model: %s" % params.exclude_from_model)

        if __debug__:
            # verify that datasets were zscored prior the alignment since it is
            # assumed/required preprocessing step
            for ids, ds in enumerate(datasets):
                for f, fname, tval in ((np.mean, 'means', 0),
                                       (np.std, 'stds', 1)):
                    vals = f(ds, axis=0)
                    vals_comp = np.abs(vals - tval) > 1e-5
                    if np.any(vals_comp):
                        warning('%d %s are too different (max diff=%g) from %d in '
                                'dataset %d to come from a zscored dataset. '
                                'Please zscore datasets first for correct operation '
                                '(unless if was intentional)'
                                % (np.sum(vals_comp), fname,
                                   np.max(np.abs(vals)), tval, ids))

        # Setting up SearchlightHyperalignment
        # we need to know which original features where comprising the
        # individual SL ROIs
        _shpaldebug('Initializing FeatureSelectionHyperalignment.')
        hmeasure = FeatureSelectionHyperalignment(
            featsel=params.featsel,
            hyperalignment=params.hyperalignment,
            full_matrix=params.combine_neighbormappers,
            use_same_features=params.use_same_features,
            exclude_from_model=params.exclude_from_model,
            dtype=params.dtype)

        # Performing SL processing manually
        _shpaldebug("Setting up for searchlights")
        if params.nproc is None and externals.exists('pprocess'):
            import pprocess
            try:
                params.nproc = pprocess.get_number_of_cores() or 1
            except AttributeError:
                warning("pprocess version %s has no API to figure out maximal "
                        "number of cores. Using 1"
                        % externals.versions['pprocess'])
                params.nproc = 1

        # XXX I think this class should already accept a single dataset only.
        # It should have a ``space`` setting that names a sample attribute that
        # can be used to identify individual/original datasets.
        # Taking a single dataset as argument would be cleaner, because the
        # algorithm relies on the assumption that there is a coarse feature
        # alignment, i.e. the SL ROIs cover roughly the same area
        queryengines = self._get_trained_queryengines(
            datasets, params.queryengine, params.radius, params.ref_ds)
        # For surface nodes to voxels queryengines, roi_seed hardly makes sense
        if isinstance(queryengines[params.ref_ds], SurfaceVerticesQueryEngine):
            self.force_roi_seed = False
            if not self.params.combine_neighbormappers:
                raise NotImplementedError("Mapping from voxels to surface nodes is not "
                        "implmented yet. Try setting combine_neighbormappers to True.")
        self.nfeatures = datasets[params.ref_ds].nfeatures
        _shpaldebug("Performing Hyperalignment in searchlights")
        # Setting up centers for running SL Hyperalignment
        if params.sparse_radius is None:
            roi_ids = self._get_verified_ids(queryengines) \
                if params.mask_node_ids is None \
                else params.mask_node_ids
        else:
            if params.queryengine is not None:
                raise NotImplementedError(
                    "using sparse_radius whenever custom queryengine is "
                    "provided is not yet supported.")
            _shpaldebug("Setting up sparse neighborhood")
            from mvpa2.misc.neighborhood import scatter_neighborhoods
            if params.mask_node_ids is None:
                scoords, sidx = scatter_neighborhoods(
                    Sphere(params.sparse_radius),
                    datasets[params.ref_ds].fa.voxel_indices,
                    deterministic=True)
                roi_ids = sidx
            else:
                scoords, sidx = scatter_neighborhoods(
                    Sphere(params.sparse_radius),
                    datasets[params.ref_ds].fa.voxel_indices[params.mask_node_ids],
                    deterministic=True)
                roi_ids = [params.mask_node_ids[sid] for sid in sidx]

        # Initialize projections
        _shpaldebug('Initializing projection matrices')
        self.projections = [
            csc_matrix((self.nfeatures, self.nfeatures), dtype=params.dtype)
            for isub in range(self.ndatasets)]

        # compute
        if params.nproc is not None and params.nproc > 1:
            # split all target ROIs centers into `nproc` equally sized blocks
            nproc_needed = min(len(roi_ids), params.nproc)
            params.nblocks = nproc_needed \
                if params.nblocks is None else params.nblocks
            params.nblocks = min(len(roi_ids), params.nblocks)
            node_blocks = np.array_split(roi_ids, params.nblocks)
            # the next block sets up the infrastructure for parallel computing
            # this can easily be changed into a ParallelPython loop, if we
            # decide to have a PP job server in PyMVPA
            import pprocess
            p_results = pprocess.Map(limit=nproc_needed)
            if __debug__:
                debug('SLC', "Starting off %s child processes for nblocks=%i"
                      % (nproc_needed, params.nblocks))
            compute = p_results.manage(
                        pprocess.MakeParallel(self._proc_block))
            seed = mvpa2.get_random_seed()
            for iblock, block in enumerate(node_blocks):
                # should we maybe deepcopy the measure to have a unique and
                # independent one per process?
                compute(block, datasets, copy.copy(hmeasure), queryengines,
                        seed=seed, iblock=iblock)
        else:
            # otherwise collect the results in an 1-item list
            _shpaldebug('Using 1 process to compute mappers.')
            if params.nblocks is None:
                params.nblocks = 1
            params.nblocks = min(len(roi_ids), params.nblocks)
            node_blocks = np.array_split(roi_ids, params.nblocks)
            p_results = [self._proc_block(block, datasets, hmeasure, queryengines)
                         for block in node_blocks]
        results_ds = self.__handle_all_results(p_results)
        # Dummy iterator for, you know, iteration
        list(results_ds)

        _shpaldebug('Wrapping projection matrices into StaticProjectionMappers')
        self.projections = [
            StaticProjectionMapper(proj=proj, recon=proj.T) if params.compute_recon
            else StaticProjectionMapper(proj=proj)
            for proj in self.projections]
        return self.projections
 def test_searchlight_hyperalignment(self):
     skip_if_no_external('scipy')
     skip_if_no_external('h5py')
     ds_orig = datasets['3dsmall'].copy()[:, :15]
     ds_orig.fa['voxel_indices'] = ds_orig.fa.myspace
     space = 'voxel_indices'
     # total number of datasets for the analysis
     nds = 5
     zscore(ds_orig, chunks_attr=None)
     dss = [ds_orig]
     # create a few distorted datasets to match the desired number of datasets
     # not sure if this truly mimics the real data, but at least we can test
     # implementation
     while len(dss) < nds - 1:
         sd = local_random_affine_transformations(
             ds_orig,
             scatter_neighborhoods(Sphere(1),
                                   ds_orig.fa[space].value,
                                   deterministic=True)[1],
             Sphere(2),
             space=space,
             scale_fac=1.0,
             shift_fac=0.0)
         # sometimes above function returns dataset with nans, infs, we don't want that.
         if np.sum(np.isnan(sd.samples)+np.isinf(sd.samples)) == 0 \
                 and np.all(sd.samples.std(0)):
             dss.append(sd)
     ds_orig_noisy = ds_orig.copy()
     ds_orig_noisy.samples += 0.1 * np.random.random(
         size=ds_orig_noisy.shape)
     dss.append(ds_orig_noisy)
     _ = [zscore(sd, chunks_attr=None) for sd in dss[1:]]
     # we should have some distortion
     for ds in dss[1:]:
         assert_false(np.all(ds_orig.samples == ds.samples))
     # testing checks
     slhyp = SearchlightHyperalignment(ref_ds=1, exclude_from_model=[1])
     self.assertRaises(ValueError, slhyp, dss[:3])
     slhyp = SearchlightHyperalignment(ref_ds=3)
     self.assertRaises(ValueError, slhyp, dss[:3])
     # explicit test of exclude_from_model
     slhyp = SearchlightHyperalignment(ref_ds=2,
                                       exclude_from_model=[1],
                                       featsel=0.7)
     projs1 = slhyp(dss)
     aligned1 = [proj.forward(ds) for proj, ds in zip(projs1, dss)]
     samples = dss[1].samples.copy()
     dss[1].samples += 0.1 * np.random.random(size=dss[1].shape)
     projs2 = slhyp(dss)
     aligned2 = [proj.forward(ds) for proj, ds in zip(projs1, dss)]
     for i in [0, 2, 3, 4]:
         assert_array_almost_equal(projs1[i].proj.todense(),
                                   projs2[i].proj.todense())
         assert_array_almost_equal(aligned1[i].samples, aligned2[i].samples)
     assert_false(
         np.all(projs1[1].proj.todense() == projs1[2].proj.todense()))
     assert_false(np.all(aligned1[1].samples == aligned2[1].samples))
     dss[1].samples = samples
     # store projections for each mapper separately
     projs = list()
     # run the algorithm with all combinations of the two major parameters
     # for projection calculation.
     for kwargs in [{
             'combine_neighbormappers': True,
             'nproc': 1 + int(externals.exists('pprocess'))
     }, {
             'combine_neighbormappers': True,
             'dtype': 'float64',
             'compute_recon': True
     }, {
             'combine_neighbormappers': True,
             'exclude_from_model': [2, 4]
     }, {
             'combine_neighbormappers': False
     }, {
             'combine_neighbormappers': False,
             'mask_node_ids': np.arange(dss[0].nfeatures).tolist()
     }, {
             'combine_neighbormappers': True,
             'sparse_radius': 1
     }, {
             'combine_neighbormappers': True,
             'nblocks': 2
     }]:
         slhyp = SearchlightHyperalignment(radius=2, **kwargs)
         mappers = slhyp(dss)
         # one mapper per input ds
         assert_equal(len(mappers), nds)
         projs.append(mappers)
     # some checks
     for midx in range(nds):
         # making sure mask_node_ids options works as expected
         assert_array_almost_equal(projs[3][midx].proj.todense(),
                                   projs[4][midx].proj.todense())
         # recon check
         assert_array_almost_equal(projs[0][midx].proj.todense(),
                                   projs[1][midx].recon.T.todense(),
                                   decimal=5)
         assert_equal(projs[1][midx].proj.dtype, 'float64')
         assert_equal(projs[0][midx].proj.dtype, 'float32')
     # making sure the projections make sense
     for proj in projs:
         # no .max on sparse matrices on older scipy (e.g. on precise) so conver to array first
         max_weight = proj[0].proj.toarray().max(1).squeeze()
         diag_weight = proj[0].proj.diagonal()
         # Check to make sure diagonal is the max weight, in almost all rows for reference subject
         assert (np.sum(max_weight == diag_weight) / float(len(diag_weight))
                 >= 0.80)
         # and not true for other subjects
         for i in range(1, nds - 1):
             assert (np.sum(proj[i].proj.toarray().max(1).squeeze() ==
                            proj[i].proj.diagonal()) /
                     float(proj[i].proj.shape[0]) < 0.80)
         # Check to make sure projection weights match across duplicate datasets
         max_weight = proj[-1].proj.toarray().max(1).squeeze()
         diag_weight = proj[-1].proj.diagonal()
         # Check to make sure diagonal is the max weight, in almost all rows for reference subject
         assert (np.sum(max_weight == diag_weight) / float(len(diag_weight))
                 >= 0.80)
     # project data
     dss_hyper = [hm.forward(sd) for hm, sd in zip(projs[0], dss)]
     _ = [zscore(sd, chunks_attr=None) for sd in dss_hyper]
     ndcss = []
     nf = ds_orig.nfeatures
     for ds_hyper in dss_hyper:
         ndcs = np.diag(np.corrcoef(ds_hyper.samples.T,
                                    ds_orig.samples.T)[nf:, :nf],
                        k=0)
         ndcss += [ndcs]
     assert_true(np.median(ndcss[0]) > 0.9)
     # noisy copy of original dataset should be similar to original after hyperalignment
     assert_true(np.median(ndcss[-1]) > 0.9)
     assert_true(np.all([np.median(ndcs) > 0.2 for ndcs in ndcss[1:-2]]))