Example #1
def _do_subject_smooth(subject_data,
                       fwhm,
                       prefix=None,
                       write_output_images=2,
                       func_basenames=None,
                       concat=False,
                       caching=True):
    if prefix is None:
        prefix = PREPROC_OUTPUT_IMAGE_PREFICES['smoothing']
    if func_basenames is None:
        func_basenames = [get_basenames(func) for func in subject_data.func]
    if caching:
        mem = Memory(cachedir=os.path.join(subject_data.output_dir,
                                           'cache_dir'),
                     verbose=100)
    else:
        mem = Memory(None, verbose=0)  # no-op cache so mem is always defined
    sfunc = []
    for sess in range(subject_data.n_sessions):
        sess_func = subject_data.func[sess]
        _tmp = mem.cache(smooth_image)(sess_func, fwhm)
        if write_output_images == 2:
            _tmp = mem.cache(save_vols)(_tmp,
                                        subject_data.output_dir,
                                        basenames=func_basenames[sess],
                                        prefix=prefix,
                                        concat=concat)
        sfunc.append(_tmp)
    subject_data.func = sfunc
    return subject_data
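All of these snippets lean on the same joblib.Memory pattern: wrap a function with mem.cache(...) so that repeated calls with identical arguments are loaded from disk instead of recomputed. A minimal standalone sketch (the cache directory name 'demo_cache' is illustrative):

# Minimal joblib.Memory round-trip; 'demo_cache' is an illustrative directory.
from joblib import Memory

mem = Memory('demo_cache', verbose=0)

def slow_square(x):
    # stands in for an expensive step such as smooth_image above
    return x ** 2

cached_square = mem.cache(slow_square)
print(cached_square(3))  # computed and written to the cache on the first call
print(cached_square(3))  # served from the on-disk cache on the second call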
Example #2
    def _delete_orientation(self):
        """
        Delete orientation metadata. Garbage orientation metadata can lead to
        severe mis-registration trouble.

        """

        # prepare for smart caching
        if self.scratch is None:
            self.scratch = self.output_dir
        cache_dir = os.path.join(self.scratch, 'cache_dir')
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        mem = Memory(cachedir=cache_dir, verbose=5)

        # deleteorient for func
        for attr in ['n_sessions', 'session_output_dirs']:
            if getattr(self, attr) is None:
                warnings.warn("'%s' attribute of is None! Skipping" % attr)
                break
        else:
            self.func = [mem.cache(delete_orientation)(
                self.func[sess], self.session_output_dirs[sess])
                         for sess in range(self.n_sessions)]

        # deleteorient for anat
        if self.anat is not None:
            self.anat = mem.cache(delete_orientation)(
                self.anat, self.anat_output_dir)
Example #3
    def _delete_orientation(self):
        """
        Delete orientation metadata. Garbage orientation metadata can lead to
        severe mis-registration trouble.

        """

        # prepare for smart caching
        if self.scratch is None:
            self.scratch = self.output_dir
        if self.caching:
            cache_dir = os.path.join(self.scratch, 'cache_dir')
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            mem = Memory(cachedir=cache_dir, verbose=5)
        else:
            mem = Memory(None, verbose=0)

        # deleteorient for func
        for attr in ['n_sessions', 'session_output_dirs']:
            if getattr(self, attr) is None:
                warnings.warn("'%s' attribute of is None! Skipping" % attr)
                break
        else:
            self.func = [
                mem.cache(delete_orientation)(self.func[sess],
                                              self.session_output_dirs[sess])
                for sess in range(self.n_sessions)
            ]

        # deleteorient for anat
        if self.anat is not None:
            self.anat = mem.cache(delete_orientation)(self.anat,
                                                      self.anat_output_dir)
Example #4
def _do_subject_slice_timing(subject_data, ref_slice=0,
                             slice_order="ascending", interleaved=False,
                             caching=True, write_output_images=2,
                             func_prefix=None, func_basenames=None,
                             ext=None, verbose=True):
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['STC']
    if func_basenames is None:
        func_basenames = [get_basenames(func)
                          for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'),
            verbose=100 if verbose is True else verbose)
    else:
        mem = Memory(None)
    stc_output = []
    original_bold = subject_data.func
    for sidx, sess_func in enumerate(subject_data.func):
        fmristc = fMRISTC(slice_order=slice_order, ref_slice=ref_slice,
                          interleaved=interleaved, verbose=verbose)
        mem.cache(fmristc.fit)(raw_data=sess_func)
        stc_output.append(mem.cache(fmristc.transform)(
                sess_func,
                output_dir=subject_data.session_output_dirs[sidx] if (
                    write_output_images > 0) else None,
                basenames=func_basenames[sidx],
                prefix=func_prefix, ext=ext))
    subject_data.func = stc_output
    del original_bold, fmristc
    if write_output_images > 1:
        subject_data.hardlink_output_files(verbose=verbose)
    return subject_data
Example #5
    def fit(self, X, y=None):
        """
        Compute agglomerative clustering.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)

        Returns
        -------
        self
        """

        memory = self.memory
        if isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        if self.n_landmarks is None:
            distances = memory.cache(pdist)(X, self.metric)
        else:
            if self.landmark_strategy == 'random':
                land_indices = check_random_state(self.random_state).randint(len(X), size=self.n_landmarks)
            else:
                land_indices = np.arange(len(X))[::(len(X) // self.n_landmarks)][:self.n_landmarks]
            distances = memory.cache(pdist)(X[land_indices], self.metric)

        tree = memory.cache(linkage)(distances, method=self.linkage)
        self.landmark_labels_ = fcluster(tree, criterion='maxclust', t=self.n_clusters) - 1

        if self.n_landmarks is None:
            self.landmarks_ = X
        else:
            self.landmarks_ = X[land_indices]

        return self
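The cached SciPy calls in this fit (pdist, linkage) and the final fcluster step can be exercised on their own; a standalone sketch, with 'demo_cache' as an illustrative cache directory:

import numpy as np
from joblib import Memory
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist

mem = Memory('demo_cache', verbose=0)
X = np.random.RandomState(0).rand(50, 3)

distances = mem.cache(pdist)(X, 'euclidean')         # cached condensed distances
tree = mem.cache(linkage)(distances, method='ward')  # cached linkage tree
labels = fcluster(tree, criterion='maxclust', t=4) - 1  # zero-based labels
print(np.bincount(labels))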
Example #6
    def transform(self, niimgs):
        memory = self.transform_memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        # Load data (if filenames are given, load them)
        if self.verbose > 0:
            print "[%s.transform] Loading data" % self.__class__.__name__
        niimgs = utils.check_niimgs(niimgs)

        # Resampling: allows the user to change the affine, the shape or both
        if self.verbose > 0:
            print "[%s.transform] Resampling" % self.__class__.__name__
        niimgs = memory.cache(resampling.resample_img)(niimgs,
                    target_affine=self.target_affine,
                    target_shape=self.target_shape)

        # Get series from data with optional smoothing
        if self.verbose > 0:
            print "[%s.transform] Masking and smoothing" \
                % self.__class__.__name__
        data = masking.apply_mask(niimgs, self.mask_, smooth=self.smooth)

        # Temporal
        # ========
        # Detrending (optional)
        # Filtering (grab TR from header)
        # Confounds (from csv file or numpy array)
        # Normalizing

        if self.verbose > 0:
            print "[%s.transform] Cleaning signal" % self.__class__.__name__
        if self.sessions_ is None:
            data = memory.cache(signals.clean)(data,
                    confounds=self.confounds, low_pass=self.low_pass,
                    high_pass=self.high_pass, t_r=self.t_r,
                    detrend=self.detrend, normalize=False)
        else:
            for s in np.unique(self.sessions_):
                if self.confounds is not None:
                    session_confounds = self.confounds[self.sessions_ == s]
                    data[self.sessions_ == s] = \
                        memory.cache(signals.clean)(
                                data=data[self.sessions_ == s],
                                confounds=session_confounds,
                                low_pass=self.low_pass,
                                high_pass=self.high_pass, t_r=self.t_r,
                                detrend=self.detrend, normalize=False)

        # For _later_: missing value removal or imputing of missing data
        # (i.e. we want to get rid of NaNs, if smoothing must be done
        # earlier)
        # Optionally: 'doctor_nan', remove voxels with NaNs, other option
        # for later: some form of imputation

        # data is in format voxel x time_series. We inverse it
        data = np.rollaxis(data, -1)

        self.affine_ = niimgs.get_affine()
        return data
Example #7
    def fit(self, X, y=None):
        """
        Compute agglomerative clustering.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)

        Returns
        -------
        self
        """

        memory = self.memory
        if isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        if self.n_landmarks is None:
            distances = memory.cache(pdist)(X, self.metric)
        else:
            if self.landmark_strategy == 'random':
                land_indices = check_random_state(self.random_state).randint(len(X), size=self.n_landmarks)
            else:
                land_indices = np.arange(len(X))[::(len(X)//self.n_landmarks)][:self.n_landmarks]
            distances = memory.cache(pdist)(X[land_indices], self.metric)

        tree = memory.cache(linkage)(distances, method=self.linkage)
        self.landmark_labels_ = fcluster(tree, criterion='maxclust', t=self.n_clusters) - 1

        if self.n_landmarks is None:
            self.landmarks_ = X
        else:
            self.landmarks_ = X[land_indices]

        return self
Example #8
def _do_subject_coregister(
        subject_data, coreg_func_to_anat=True, caching=True,
        ext=None, write_output_images=2, func_basenames=None, func_prefix="",
        anat_basename=None, anat_prefix="", report=True, verbose=True):
    ref_brain = 'func'
    src_brain = 'anat'
    ref = subject_data.func[0]
    src = subject_data.anat
    if coreg_func_to_anat:
        ref_brain, src_brain = src_brain, ref_brain
        ref, src = src, ref

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
                subject_data.output_dir, 'cache_dir'),
                verbose=100 if verbose is True else verbose)
    else:
        mem = Memory(None)

    # estimate realignment (affine) params for coreg
    coreg = Coregister(verbose=verbose)
    coreg = mem.cache(coreg.fit)(ref, src)

    # apply coreg
    if coreg_func_to_anat:
        if func_basenames is None:
            func_basenames = [get_basenames(func)
                              for func in subject_data.func]
        coreg_func = []
        for sidx, sess_func in enumerate(subject_data.func):
            output_dir = subject_data.session_scratch_dirs[sidx]
            coreg_func.append(mem.cache(coreg.transform)(
                sess_func, output_dir=output_dir if (
                    write_output_images == 2) else None,
                basenames=func_basenames[sidx] if coreg_func_to_anat
                else anat_basename, prefix=func_prefix))
        subject_data.func = coreg_func
        src = load_vols(subject_data.func[0])[0]
    else:
        if anat_basename is None:
            anat_basename = get_basenames(subject_data.anat)
        subject_data.anat = mem.cache(coreg.transform)(
            subject_data.anat, basename=anat_basename,
            output_dir=subject_data.anat_scratch_output_dir if (
                write_output_images == 2) else None, prefix=anat_prefix,
            ext=ext)
        src = subject_data.anat

    # generate coregistration QA thumbs
    if report:
        subject_data.generate_coregistration_thumbnails(
            coreg_func_to_anat=coreg_func_to_anat, nipype=False)

    del coreg
    if write_output_images > 1:
        subject_data.hardlink_output_files(verbose=verbose)

    return subject_data
Example #9
    def _cache(self, func, memory_level=1, **kwargs):
        """ Return a joblib.Memory object if necessary.

        The memory_level determines the level above which the wrapped
        function output is cached. By specifying a numeric value for
        this level, the user can control the amount of cache memory
        used. This function will cache the function call or not
        depending on the cache level.

        Parameters
        ----------
        func: python function
            The function which output is to be cached.

        memory_level: integer
            The memory_level from which caching must be enabled for the wrapped
            function.

        Returns
        -------
        Either a non-cached wrapper of the function, if there is no need to
        cache it (because the requested level is lower than the value given
        to _cache()), or a joblib.MemorizedFunc object that wraps func.
        """

        # Creates attributes if they don't exist
        # This is to make creating them in __init__() optional.
        if not hasattr(self, "memory_level"):
            self.memory_level = 0
        if not hasattr(self, "memory"):
            self.memory = Memory(cachedir=None)

        # If cache level is 0 but a memory object has been provided, set
        # memory_level to 1 with a warning.
        if self.memory_level == 0:
            if (isinstance(self.memory, basestring)
                    or self.memory.cachedir is not None):
                warnings.warn("memory_level is currently set to 0 but "
                              "a Memory object has been provided. "
                              "Setting memory_level to 1.")
                self.memory_level = 1

        if self.memory_level < memory_level:
            mem = Memory(cachedir=None)
            return mem.cache(func, **kwargs)
        else:
            memory = self.memory
            if isinstance(memory, basestring):
                memory = Memory(cachedir=memory)
            if not isinstance(memory, Memory):
                raise TypeError("'memory' argument must be a string or a "
                                "joblib.Memory object.")
            if memory.cachedir is None:
                warnings.warn(
                    "Caching has been enabled (memory_level = %d) but no"
                    " Memory object or path has been provided (parameter"
                    " memory). Caching deactivated for function %s." %
                    (self.memory_level, func.func_name))
            return memory.cache(func, **kwargs)
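Condensed, the gate above picks between a no-op Memory and the real one. The sketch below restates it with Python 3 names; CachedObject is a hypothetical stand-in for the class this method lives on:

from joblib import Memory

class CachedObject:
    def __init__(self, memory=None, memory_level=1):
        self.memory = memory if memory is not None else Memory(None)
        self.memory_level = memory_level

    def _cache(self, func, memory_level=1, **kwargs):
        if self.memory_level < memory_level:
            # below the threshold: wrap with a no-op Memory (nothing cached)
            return Memory(None).cache(func, **kwargs)
        memory = self.memory
        if isinstance(memory, str):
            memory = Memory(memory)
        return memory.cache(func, **kwargs)

def total(xs):
    return sum(xs)

obj = CachedObject(memory=Memory('demo_cache', verbose=0), memory_level=2)
cached_total = obj._cache(total, memory_level=1)  # 2 >= 1, so the call is cached
print(cached_total((1, 2, 3)))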
Example #10
    def _cache(self, func, memory_level=1, **kwargs):
        """ Return a joblib.Memory object if necessary.

        The memory_level determines the level above which the wrapped
        function output is cached. By specifying a numeric value for
        this level, the user can control the amount of cache memory
        used. This function will cache the function call or not
        depending on the cache level.

        Parameters
        ----------
        func: python function
            The function which output is to be cached.

        memory_level: integer
            The memory_level from which caching must be enabled for the wrapped
            function.

        Returns
        -------
        Either a non-cached wrapper of the function, if there is no need to
        cache it (because the requested level is lower than the value given
        to _cache()), or a joblib.MemorizedFunc object that wraps func.
        """

        # Creates attributes if they don't exist
        # This is to make creating them in __init__() optional.
        if not hasattr(self, "memory_level"):
            self.memory_level = 0
        if not hasattr(self, "memory"):
            self.memory = Memory(cachedir=None)

        # If cache level is 0 but a memory object has been provided, set
        # memory_level to 1 with a warning.
        if self.memory_level == 0:
            if (isinstance(self.memory, basestring)
                    or self.memory.cachedir is not None):
                warnings.warn("memory_level is currently set to 0 but "
                              "a Memory object has been provided. "
                              "Setting memory_level to 1.")
                self.memory_level = 1

        if self.memory_level < memory_level:
            mem = Memory(cachedir=None)
            return mem.cache(func, **kwargs)
        else:
            memory = self.memory
            if isinstance(memory, basestring):
                memory = Memory(cachedir=memory)
            if not isinstance(memory, Memory):
                raise TypeError("'memory' argument must be a string or a "
                                "joblib.Memory object.")
            if memory.cachedir is None:
                warnings.warn("Caching has been enabled (memory_level = %d) but no"
                              " Memory object or path has been provided (parameter"
                              " memory). Caching deactivated for function %s." %
                              (self.memory_level, func.func_name))
            return memory.cache(func, **kwargs)
Example #11
def affine_registration_pypreprocess(in_path, ref_path, out_path, 
                                     in_ref_mat = '', ref_in_mat = '',
                                     T = None, force_resample = False,
                                     extra_params={}):
    """
    Affine registration and resampling, using Coregister from pypreprocess.
    
    Coregister is designed for transformation between func and anat, so applying
    this function to mni standard space may not produce the best result.
    
    inputs:
        in_path: path to the source (input) image.
        ref_path: path to the target (reference) image.
        out_path: path to use to save the registered image. 
        in_ref_mat: if bool(in_ref_mat) is True, save the 4x4 transformation
                    matrix to a text file <in_ref_mat>. 
        ref_in_mat: if bool(ref_in_mat) is True, save the reverse of the 4x4
                    transformation matrix to a text file <ref_in_mat>. 
        T: specific transformation to use. if None, T will be estimated using 
           Coregister().fit; else numpy.array(T) will be used. T is an array
           of 6 elements; the first three represent translation, and the last
           three represent rotations. 
        force_resample: bool. Whether or not to resample in an extra step.
            By default pypreprocess does not resample data, which means we
            have to use nilearn's module to do that. Also, scaling is not one
            of the provided DoF/estimation parameters of pypreprocess, nor
            did I implement it myself; maybe check scipy.misc.imresize if
            scaling needs to be implemented in the future.
        extra_params: for Coregister()
        
    """
    source = nib.load(in_path)
    target = nib.load(ref_path)
    
#    coreg = Coregister()
    coreg = AllFeatures(Coregister, extra_params).run()
    
    mem = Memory("affine_registration_pypreprocess_cache")
    if T is None:
        coreg = mem.cache(coreg.fit)(target, source)  # fit(target, source)
    else:
        T_ = np.array(T)
        if T_.size != 6 or T_.dtype != float:
            raise ValueError('T should either be None or an ndarray with '
                             'size 6 and dtype float')
        print('using predefined T = %s' % T)
        coreg.params_ = T_
    
    img = coreg.transform(source)[0]
    if force_resample: # no rescaling here
        img = mem.cache(resample_img)(img, target.affine, target.shape)
    nib.save(img, out_path)
    if in_ref_mat:
        np.savetxt(in_ref_mat,  coreg.params_)
    if ref_in_mat:
        np.savetxt(ref_in_mat, -coreg.params_)
    
    return coreg.params_
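A hypothetical invocation of the function above; the file paths are placeholders, and pypreprocess plus the AllFeatures helper must be importable for it to run:

params = affine_registration_pypreprocess(
    in_path='anat.nii.gz',          # source image to move (placeholder)
    ref_path='mean_func.nii.gz',    # reference image to match (placeholder)
    out_path='anat_coreg.nii.gz',   # where the registered image is saved
    in_ref_mat='anat_to_func.txt',  # also dump the 6-parameter transform
    force_resample=True)            # resample onto the reference grid
print(params)  # three translations followed by three rotations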
Example #12
    def fit(self, niimgs, y=None):
        """Compute the mask corresponding to the data

        Parameters
        ----------
        niimgs: list of filenames or NiImages
            Data on which the mask must be calculated. If this is a list,
            the affine is considered the same for all.
        """

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        # Load data (if filenames are given, load them)
        if self.verbose > 0:
            print "[%s.fit] Loading data from %s" % (
                self.__class__.__name__,
                utils._repr_niimgs(niimgs)[:200])
        data = []
        for niimg in niimgs:
            # Note that data is not loaded into memory at this stage
            # if niimg is a string
            data.append(utils.check_niimgs(niimg, accept_3d=True))

        # Compute the mask if not given by the user
        if self.mask is None:
            if self.verbose > 0:
                print "[%s.fit] Computing the mask" % self.__class__.__name__
            mask = memory.cache(masking.compute_multi_epi_mask,
                                ignore=['verbose'])(
                                    niimgs,
                                    connected=self.mask_connected,
                                    opening=self.mask_opening,
                                    lower_cutoff=self.mask_lower_cutoff,
                                    upper_cutoff=self.mask_upper_cutoff,
                                    n_jobs=self.n_jobs,
                                    verbose=(self.verbose - 1))
            self.mask_img_ = Nifti1Image(mask.astype(np.int), data[0].get_affine())
        else:
            self.mask_img_ = utils.check_niimg(self.mask)

        # If resampling is requested, resample also the mask
        # Resampling: allows the user to change the affine, the shape or both
        if self.verbose > 0:
            print "[%s.transform] Resampling mask" % self.__class__.__name__
        self.mask_img_ = memory.cache(resampling.resample_img)(
            self.mask_img_,
            target_affine=self.target_affine,
            target_shape=self.target_shape,
            copy=(self.target_affine is not None and
                  self.target_shape is not None))

        return self
Example #13
    def fit(self, niimgs, y=None):
        """Compute the mask corresponding to the data

        Parameters
        ----------
        niimgs: list of filenames or NiImages
            Data on which the mask must be calculated. If this is a list,
            the affine is considered the same for all.
        """

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        # Load data (if filenames are given, load them)
        if self.verbose > 0:
            print "[%s.fit] Loading data from %s" % (
                self.__class__.__name__, utils._repr_niimgs(niimgs)[:200])
        data = []
        for niimg in niimgs:
            # Note that data is not loaded into memory at this stage
            # if niimg is a string
            data.append(utils.check_niimgs(niimg, accept_3d=True))

        # Compute the mask if not given by the user
        if self.mask is None:
            if self.verbose > 0:
                print "[%s.fit] Computing the mask" % self.__class__.__name__
            mask = memory.cache(masking.compute_multi_epi_mask,
                                ignore=['verbose'
                                        ])(niimgs,
                                           connected=self.mask_connected,
                                           opening=self.mask_opening,
                                           lower_cutoff=self.mask_lower_cutoff,
                                           upper_cutoff=self.mask_upper_cutoff,
                                           n_jobs=self.n_jobs,
                                           verbose=(self.verbose - 1))
            self.mask_img_ = Nifti1Image(mask.astype(np.int),
                                         data[0].get_affine())
        else:
            self.mask_img_ = utils.check_niimg(self.mask)

        # If resampling is requested, resample also the mask
        # Resampling: allows the user to change the affine, the shape or both
        if self.verbose > 0:
            print "[%s.transform] Resampling mask" % self.__class__.__name__
        self.mask_img_ = memory.cache(resampling.resample_img)(
            self.mask_img_,
            target_affine=self.target_affine,
            target_shape=self.target_shape,
            copy=(self.target_affine is not None
                  and self.target_shape is not None))

        return self
Example #14
    def _fit(self, X, y=None, **fit_params):
        self._validate_steps()
        # Setup the memory
        memory = self.memory
        if memory is None:
            memory = Memory(cachedir=None, verbose=0)
        elif isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        elif not isinstance(memory, Memory):
            raise ValueError("'memory' should either be a string or"
                             " a joblib.Memory instance, got"
                             " 'memory={!r}' instead.".format(memory))

        fit_transform_one_cached = memory.cache(_fit_transform_one)
        fit_sample_one_cached = memory.cache(_fit_sample_one)

        fit_params_steps = dict((name, {}) for name, step in self.steps
                                if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        yt = y
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to preserve
                    # backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transfomer
                if (hasattr(cloned_transformer, "transform") or
                        hasattr(cloned_transformer, "fit_transform")):
                    Xt, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, None, Xt, yt,
                        **fit_params_steps[name])
                elif hasattr(cloned_transformer, "sample"):
                    Xt, yt, fitted_transformer = fit_sample_one_cached(
                        cloned_transformer, Xt, yt,
                        **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator is None:
            return Xt, yt, {}
        return Xt, yt, fit_params_steps[self.steps[-1][0]]
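The same caching hook is exposed publicly through scikit-learn's Pipeline(memory=...) parameter; a small sketch using only standard scikit-learn APIs:

from tempfile import mkdtemp
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
pipe = Pipeline([('pca', PCA(n_components=5)),
                 ('clf', LogisticRegression())],
                memory=mkdtemp())  # transformer fits are cached on disk
pipe.fit(X, y)  # PCA is fitted and its result cached
pipe.fit(X, y)  # the cached PCA fit is reloaded instead of recomputed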
Example #15
    def _fit(self, X, y=None, **fit_params):
        self._validate_steps()
        # Setup the memory
        memory = self.memory
        if memory is None:
            memory = Memory(cachedir=None, verbose=0)
        elif isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        elif not isinstance(memory, Memory):
            raise ValueError("'memory' should either be a string or"
                             " a joblib.Memory instance, got"
                             " 'memory={!r}' instead.".format(memory))

        fit_transform_one_cached = memory.cache(_fit_transform_one)
        fit_resample_one_cached = memory.cache(_fit_resample_one)

        fit_params_steps = dict(
            (name, {}) for name, step in self.steps if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        yt = y
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                if memory.cachedir is None:
                    # we do not clone when caching is disabled to preserve
                    # backward compatibility
                    cloned_transformer = transformer
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transfomer
                if (hasattr(cloned_transformer, "transform")
                        or hasattr(cloned_transformer, "fit_transform")):
                    Xt, fitted_transformer = fit_transform_one_cached(
                        cloned_transformer, None, Xt, yt,
                        **fit_params_steps[name])
                elif hasattr(cloned_transformer, "fit_resample"):
                    Xt, yt, fitted_transformer = fit_resample_one_cached(
                        cloned_transformer, Xt, yt, **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator is None:
            return Xt, yt, {}
        return Xt, yt, fit_params_steps[self.steps[-1][0]]
Example #16
def motion_correction_nipy(in_file, out_path, mc_alg, extra_params={}):
    """
    An attempt at motion correction using the NiPy package.
    
    inputs:
        in_file: Full path to the resting-state scan. 
        out_path: Full path to the (to be) output file. 
        mc_alg: can be either 'nipy_spacerealign' or 'nipy_spacetimerealign'
        extra_params: extra parameters to SpaceRealign, SpaceTimeRealign, estimate
    return: the motion corrected image
    """

    alg_dict = {
        'nipy_spacerealign': (SpaceRealign, {}),
        'nipy_spacetimerealign': (SpaceTimeRealign, {
            'tr': 2,
            'slice_times': 'asc_alt_2',
            'slice_info': 2
        })
    }
    # format: {'function_name':(function, kwargs), ...}

    # processing starts here
    if type(in_file) in nib.all_image_classes:
        I = nifti2nipy(in_file)  # assume Nifti1Image
    else:
        I = load_image(in_file)
    print 'source image loaded. '

    # initialize the registration algorithm
    reg = AllFeatures(alg_dict[mc_alg][0],
                      extra_params).run(I, **alg_dict[mc_alg][1])
    #    reg = alg_dict[mc_alg][0](I, **alg_dict[mc_alg][1]) # SpaceTimeRealign(I, tr=2, ...)
    print 'motion correction algorithm established. '
    print 'estimating...'

    if USE_CACHE:
        mem = Memory("func_preproc_cache_2")
        mem.cache(AllFeatures(reg.estimate, extra_params).run)(refscan=None)
#        mem.cache(reg.estimate)(refscan=None)
    else:
        AllFeatures(reg.estimate, extra_params).run(refscan=None)
#        reg.estimate(refscan=None)

    print 'estimation complete. Writing to file...'
    result = reg.resample(0)
    if out_path:
        save_image(result, out_path)
    return nipy2nifti(result)
Example #17
    def _niigz2nii(self):
        """
        Convert .nii.gz to .nii (crucial for SPM).

        """
        cache_dir = os.path.join(self.scratch, 'cache_dir')
        mem = Memory(cache_dir, verbose=100)
        self._sanitize_session_output_dirs()
        if None not in [self.func, self.n_sessions, self.session_output_dirs]:
            self.func = [mem.cache(do_niigz2nii)(
                self.func[sess], output_dir=self.session_output_dirs[sess])
                         for sess in range(self.n_sessions)]
        if self.anat is not None:
            self.anat = mem.cache(do_niigz2nii)(
                self.anat, output_dir=self.anat_output_dir)
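For reference, the conversion performed by do_niigz2nii boils down to a load-and-resave with nibabel; a minimal sketch with placeholder filenames:

import nibabel as nib

img = nib.load('func.nii.gz')  # compressed input (placeholder path)
nib.save(img, 'func.nii')      # uncompressed copy that SPM can read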
Example #18
    def test_multilabel(self):
        cache = Memory(cachedir=tempfile.gettempdir())
        cached_func = cache.cache(
            sklearn.datasets.make_multilabel_classification)
        X, Y = cached_func(n_samples=150,
                           n_features=20,
                           n_classes=5,
                           n_labels=2,
                           length=50,
                           allow_unlabeled=True,
                           sparse=False,
                           return_indicator=True,
                           return_distributions=False,
                           random_state=1)
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[101:, :]
        Y_test = Y[101:, ]

        data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_test': X_test,
            'Y_test': Y_test
        }

        dataset_properties = {'multilabel': True}
        cs = SimpleClassificationPipeline(dataset_properties=dataset_properties).\
            get_hyperparameter_search_space()
        self._test_configurations(configurations_space=cs, data=data)
Example #19
    def test_multilabel(self):
        cache = Memory(cachedir=tempfile.gettempdir())
        cached_func = cache.cache(
            sklearn.datasets.make_multilabel_classification
        )
        X, Y = cached_func(
            n_samples=150,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=True,
            sparse=False,
            return_indicator=True,
            return_distributions=False,
            random_state=1
        )
        X_train = X[:100, :]
        Y_train = Y[:100, :]
        X_test = X[101:, :]
        Y_test = Y[101:, ]

        data = {'X_train': X_train, 'Y_train': Y_train,
                'X_test': X_test, 'Y_test': Y_test}

        dataset_properties = {'multilabel': True}
        cs = SimpleClassificationPipeline(dataset_properties=dataset_properties).\
            get_hyperparameter_search_space()
        self._test_configurations(configurations_space=cs, data=data)
Example #20
    def comput_coefs(self, X, y, size):
        cv = KFold(2)  # cross-validation generator for model selection
        ridge = BayesianRidge()
        cachedir = tempfile.mkdtemp()
        mem = Memory(cachedir=cachedir, verbose=1)

        # Ward agglomeration followed by BayesianRidge
        connectivity = grid_to_graph(n_x=size, n_y=size)
        ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                    memory=mem)
        clf = Pipeline([('ward', ward), ('ridge', ridge)])
        # Select the optimal number of parcels with grid search
        clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
        coef_agglomeration_ = coef_.reshape(size, size)

        # Anova univariate feature selection followed by BayesianRidge
        f_regression = mem.cache(feature_selection.f_regression)  # caching function
        anova = feature_selection.SelectPercentile(f_regression)
        clf = Pipeline([('anova', anova), ('ridge', ridge)])
        # Select the optimal percentage of features with grid search
        clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
        coef_selection_ = coef_.reshape(size, size)
        return dict(
            coef_selection_=coef_selection_,
            coef_agglomeration_=coef_agglomeration_,
            cachedir=cachedir
        )
Example #21
def _do_subject_slice_timing(subject_data, ref_slice=0,
                             slice_order="ascending", interleaved=False,
                             caching=True, write_output_images=2,
                             func_prefix=None, func_basenames=None,
                             ext=None):
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['STC']
    if func_basenames is None:
        func_basenames = [get_basenames(func)
                          for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'), verbose=100)
    runner = lambda handle: mem.cache(handle) if caching else handle
    stc_output = []
    original_bold = subject_data.func
    for sess_func, sess_id in zip(subject_data.func,
                                  range(subject_data.n_sessions)):
        fmristc = runner(fMRISTC(slice_order=slice_order, ref_slice=ref_slice,
                                 interleaved=interleaved, verbose=True).fit)(
                                raw_data=sess_func)
        stc_output.append(runner(fmristc.transform)(
                sess_func,
                output_dir=subject_data.tmp_output_dir if (
                    write_output_images > 0) else None,
                basenames=func_basenames[sess_id],
                prefix=func_prefix, ext=ext))
    subject_data.func = stc_output
    del original_bold, fmristc
    if write_output_images > 1:
        subject_data.hardlink_output_files()
    return subject_data
Example #22
    def fit(self, niimgs, y=None):
        """Compute the mask corresponding to the data

        Parameters
        ----------
        niimgs: list of filenames or NiImages
            Data on which the mask must be calculated. If this is a list,
            the affine is considered the same for all.
        """

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        # Load data (if filenames are given, load them)
        if self.verbose > 0:
            print "[%s.fit] Loading data" % self.__class__.__name__
        niimgs = utils.check_niimgs(niimgs, accept_3d=True)

        # Compute the mask if not given by the user
        if self.mask is None:
            if self.verbose > 0:
                print "[%s.fit] Computing the mask" % self.__class__.__name__
            mask = memory.cache(masking.compute_epi_mask)(
                niimgs.get_data(),
                connected=self.mask_connected,
                opening=self.mask_opening,
                lower_cutoff=self.mask_lower_cutoff,
                upper_cutoff=self.mask_upper_cutoff,
                verbose=(self.verbose - 1),
            )
            self.mask_ = Nifti1Image(mask.astype(np.int), niimgs.get_affine())
        else:
            self.mask_ = utils.check_niimg(self.mask)

        # If resampling is requested, resample also the mask
        # Resampling: allows the user to change the affine, the shape or both
        if self.verbose > 0:
            print "[%s.transform] Resampling mask" % self.__class__.__name__
        self.mask_ = memory.cache(resampling.resample_img)(
            self.mask_,
            target_affine=self.target_affine,
            target_shape=self.target_shape,
            copy=(self.target_affine is not None and self.target_shape is not None),
        )

        return self
Example #23
def fetch_asirra(image_count=1000):
    partial_path = check_fetch_asirra()
    m = Memory(cachedir=partial_path, compress=6, verbose=0)
    load_func = m.cache(_fetch_asirra)
    images, target = load_func(partial_path, image_count=image_count)
    return Bunch(data=images.reshape(len(images), -1),
                 images=images, target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #24
    def _niigz2nii(self):
        """
        Convert .nii.gz to .nii (crucial for SPM).

        """
        cache_dir = os.path.join(self.scratch, 'cache_dir')
        mem = Memory(cache_dir, verbose=100)
        self._sanitize_session_output_dirs()
        if None not in [self.func, self.n_sessions, self.session_output_dirs]:
            self.func = [
                mem.cache(do_niigz2nii)(
                    self.func[sess], output_dir=self.session_output_dirs[sess])
                for sess in range(self.n_sessions)
            ]
        if self.anat is not None:
            self.anat = mem.cache(do_niigz2nii)(
                self.anat, output_dir=self.anat_output_dir)
Example #25
    def get_multilabel(self):
        cache = Memory(cachedir=tempfile.gettempdir())
        cached_func = cache.cache(make_multilabel_classification)
        return cached_func(n_samples=100,
                           n_features=10,
                           n_classes=5,
                           n_labels=5,
                           return_indicator=True,
                           random_state=1)
Example #26
def get_all_metadata(config=None, args=None):
  if config is None and args is None:
    raise Exception('Either config or args need to be not None')
  if config is None:
    config = get_config(args)
    
  class_meta  = read_class_meta(config.dataset.class_meta_file)
  attrib_meta_with_name = read_attribute_meta(config.dataset.attrib_meta_file)
  attrib_meta = attrib_meta_with_name.drop('class_name',axis=1)
  train_annos = read_image_annotations(config.dataset.train_annos_file)
  test_annos = read_image_annotations(config.dataset.test_annos_file,
                                      has_class_id=False)
  domain_meta = read_domain_meta(config.dataset.domain_meta_file)
  train_annos['class_name'] = np.array([class_meta.class_name[class_index] for 
                                         class_index in 
                                         train_annos.class_index])
#   test_annos['class_name'] = np.array([class_meta.class_name[class_index] for 
#                                          class_index in 
#                                          test_annos.class_index])

  # Prepend the path to the dataset to each img_path
  train_annos.img_path = train_annos.img_path.apply(lambda x: config.dataset.main_path.joinpath(x).abspath())
  test_annos.img_path = test_annos.img_path.apply(lambda x: config.dataset.main_path.joinpath(x).abspath())

  # Filter the class meta and train/test annotations to just use the 
  # domains defined in config
  class_meta = class_meta[class_meta.domain_index.isin(config.dataset.domains)]
  train_annos = train_annos[train_annos.domain_index.isin(config.dataset.domains)]
  test_annos = test_annos[test_annos.domain_index.isin(config.dataset.domains)]
  
  
  # Create dev set
  dev_annos_train, dev_annos_test = create_dev_set(train_annos, 
                                                   config)

  # Should we use the dev set as the test set
  if config.dataset.dev_set.use:
    train_used, test_used = dev_annos_train, dev_annos_test 
  else:
    train_used, test_used = train_annos, test_annos
    
    
  if config.flip_images:
    memory = Memory(cachedir=config.cache_dir, verbose=config.logging.verbose)
    flip_func = memory.cache(create_flipped_images)
    train_used = flip_func(train_used, config)

  return ({'real_train_annos': train_annos,
           'real_test_annos': test_annos,
           'train_annos': train_used,
           'test_annos': test_used,
           'validation_annos': dev_annos_test, 
            'class_meta': class_meta,
            'domain_meta': domain_meta,
            'attrib_meta': attrib_meta,
            'attrib_meta_with_name': attrib_meta_with_name},
          config)
Example #27
def cache(func, memory, ref_memory_level=2, memory_level=1, **kwargs):
    """ Return a joblib.Memory object.

    The memory_level determines the level above which the wrapped
    function output is cached. By specifying a numeric value for
    this level, the user can control the amount of cache memory
    used. This function will cache the function call or not
    depending on the cache level.

    Parameters
    ----------
    func: function
        The function which output is to be cached.

    memory: instance of joblib.Memory or string
        Used to cache the function call.

    ref_memory_level: int
        The reference memory_level used to determine if the function call
        must be cached or not (the call is cached only when ref_memory_level
        is larger than memory_level).

    memory_level: int
        The memory_level from which caching must be enabled for the wrapped
        function.

    kwargs: keyword arguments
        The keyword arguments passed to memory.cache

    Returns
    -------
    mem: joblib.MemorizedFunc
        Object that wraps the function func. This object may be a no-op
        if the requested level is lower than ref_memory_level. For
        consistency, a joblib-wrapped callable is always returned.
    """

    if ref_memory_level <= memory_level or memory is None:
        memory = Memory(cachedir=None)
    else:
        memory = memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)
        if not isinstance(memory, memory_classes):
            raise TypeError("'memory' argument must be a string or a "
                            "joblib.Memory object. "
                            "%s %s was given." % (memory, type(memory)))
        if memory.cachedir is None:
            warnings.warn("Caching has been enabled (memory_level = %d) "
                          "but no Memory object or path has been provided"
                          " (parameter memory). Caching deactivated for "
                          "function %s." %
                          (ref_memory_level, func.func_name),
                          stacklevel=2)
    return memory.cache(func, **kwargs)
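Restated with Python 3 names, the gate above fits in a few lines; cache_gate is a hypothetical stand-in for this cache() function:

from joblib import Memory

def cache_gate(func, memory, ref_memory_level=2, memory_level=1, **kwargs):
    if ref_memory_level <= memory_level or memory is None:
        memory = Memory(None)    # no-op: the call is not cached
    elif isinstance(memory, str):
        memory = Memory(memory)  # a path means a real on-disk cache
    return memory.cache(func, **kwargs)

def double(x):
    return 2 * x

cached_double = cache_gate(double, 'demo_cache', ref_memory_level=3)
print(cached_double(21))  # cached, since ref_memory_level > memory_level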
Example #28
def _do_subject_coregister(
        subject_data, coreg_func_to_anat=True, caching=True,
        ext=None, write_output_images=2, func_basenames=None, func_prefix="",
        anat_basename=None, anat_prefix="", report=True, verbose=True):
    ref_brain = 'func'
    src_brain = 'anat'
    ref = subject_data.func[0]
    src = subject_data.anat
    if coreg_func_to_anat:
        ref_brain, src_brain = src_brain, ref_brain
        ref, src = src, ref

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
                subject_data.output_dir, 'cache_dir'), verbose=100)
    runner = lambda handle: mem.cache(handle) if caching else handle

    # estimate realignment (affine) params for coreg
    coreg = runner(Coregister(verbose=verbose).fit)(ref, src)

    # apply coreg
    if coreg_func_to_anat:
        if func_basenames is None:
            func_basenames = [get_basenames(func)
                              for func in subject_data.func]
        coreg_func = []
        for sess_func, sess_id in zip(subject_data.func, range(
                subject_data.n_sessions)):
            coreg_func.append(runner(coreg.transform)(
                sess_func, output_dir=subject_data.tmp_output_dir if (
                    write_output_images == 2) else None,
                basenames=func_basenames[sess_id] if coreg_func_to_anat
                else anat_basename, prefix=func_prefix))
        subject_data.func = coreg_func
        src = load_vols(subject_data.func[0])[0]
    else:
        if anat_basename is None:
            anat_basename = get_basenames(subject_data.anat)
        subject_data.anat = runner(coreg.transform)(
            subject_data.anat, basename=anat_basename,
            output_dir=subject_data.tmp_output_dir if (
                write_output_images == 2) else None, prefix=anat_prefix,
            ext=ext)
        src = subject_data.anat

    # generate coregistration QA thumbs
    if report:
        subject_data.generate_coregistration_thumbnails(
            coreg_func_to_anat=coreg_func_to_anat, nipype=False)

    del coreg
    if write_output_images > 1:
        subject_data.hardlink_output_files()
    return subject_data
Example #29
    def get_multilabel(self):
        cache = Memory(cachedir=tempfile.gettempdir())
        cached_func = cache.cache(make_multilabel_classification)
        return cached_func(
            n_samples=100,
            n_features=10,
            n_classes=5,
            n_labels=5,
            return_indicator=True,
            random_state=1
        )
Example #30
def _do_subject_realign(subject_data, reslice=True, register_to_mean=False,
                        caching=True, hardlink_output=True, ext=None,
                        func_basenames=None, write_output_images=2,
                        report=True, func_prefix=None, verbose=True):
    if register_to_mean:
        raise NotImplementedError("Feature pending...")
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['MC']
    if func_basenames is None:
        func_basenames = [get_basenames(func)
                          for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'),
            verbose=100 if verbose is True else verbose)
    else:
        mem = Memory(None)
    mrimc = MRIMotionCorrection(
            n_sessions=subject_data.n_sessions, verbose=verbose)
    mrimc = mem.cache(mrimc.fit)(subject_data.func)
    mrimc_output = mem.cache(mrimc.transform)(
        reslice=reslice,
        output_dir=subject_data.scratch if (
            write_output_images == 2) else None, ext=ext,
        prefix=func_prefix, basenames=func_basenames)
    subject_data.func = mrimc_output['realigned_images']
    subject_data.realignment_parameters = mrimc_output[
        'realignment_parameters']

    # generate realignment thumbs
    if report:
        subject_data.generate_realignment_thumbnails(nipype=False)

    # garbage collection
    del mrimc

    if write_output_images > 1:
        subject_data.hardlink_output_files(verbose=verbose)
    return subject_data
Example #31
def get_lookalike_people():
    m = Memory(cachedir='./cache_data', compress=6, verbose=0)
    load_func = m.cache(_get_lookalike_people)

    #faces, targets, target_ids = _get_lookalike_people()
    faces, targets, target_ids = load_func()

    return Bunch(data=faces.reshape(len(faces), -1),
                 images=faces,
                 target=target_ids,
                 target_names=targets,
                 DESCR="Look Alike People Dataset")
Example #32
def _do_subject_smooth(subject_data, fwhm, prefix=None,
                       write_output_images=2, func_basenames=None,
                       concat=False, caching=True):
    if prefix is None:
        prefix = PREPROC_OUTPUT_IMAGE_PREFICES['smoothing']
    if func_basenames is None:
        func_basenames = [get_basenames(func) for func in subject_data.func]
    if caching:
        mem = Memory(cachedir=os.path.join(
                subject_data.output_dir, 'cache_dir'), verbose=100)
    else:
        mem = Memory(None, verbose=0)  # no-op cache so mem is always defined
    sfunc = []
    for sess in range(subject_data.n_sessions):
        sess_func = subject_data.func[sess]
        _tmp = mem.cache(smooth_image)(sess_func, fwhm)
        if write_output_images == 2:
            _tmp = mem.cache(save_vols)(
                _tmp, subject_data.output_dir, basenames=func_basenames[sess],
                prefix=prefix, concat=concat)
        sfunc.append(_tmp)
    subject_data.func = sfunc
    return subject_data
Example #33
def cache(self, func, func_memory_level, **kwargs):
    """ Return a joblib.Memory object if necessary (depends on memory_level)

    The memory_level is a rough estimate of the amount of memory necessary
    to cache a function call. By specifying a numeric value for this level,
    the user can roughly control how much memory is used on their
    computer. This function will cache the function call or not depending
    on the memory level. This is a helper to avoid code duplication.

    Parameters
    ----------

    self: python object
        The object containing information about caching. It must have a
        memory attribute (used if caching is necessary) and an integer
        memory_level attribute to determine if the function must be cached
        or not.

    func: python function
        The function that may be cached

    func_memory_level: integer
        The memory_level from which caching must be enabled.

    Returns
    -------

    Either the original function (if there is no need to cache it) or a
    joblib.MemorizedFunc object that caches calls to the function.
    """
    # if memory level is 0 but a memory object is provided, put memory_level
    # to 1 with a warning
    if self.memory_level == 0:
        if hasattr(self, 'memory') and self.memory is not None \
                                   and (isinstance(self.memory, basestring)
                                   or self.memory.cachedir is not None):
            warnings.warn("memory_level is set to 0 but a Memory object has"
                    " been provided. Setting memory_level to 1.")
            self.memory_level = 1
    if self.memory_level < func_memory_level:
        return func
    else:
        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)
        if memory.cachedir is None:
            warnings.warn("Caching has been enabled (memory_level = %d) but no"
                          " Memory object or path has been provided (parameter"
                          " memory). Caching canceled for function %s." %
                          (self.memory_level, func.func_name))
        return memory.cache(func, **kwargs)
Example #34
    def fit(self, data, Y=None):
        if hasattr(data, 'copy'):
            # It's an array
            data = data.copy()
        else:
            # Probably a list
            data = copy.deepcopy(data)

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(subject_pca)(
                subject_data, n_components=self.n_components, mem=memory)
            for subject_data in data)
        pcas = np.concatenate(pcas, axis=1)

        if self.kurtosis_thr is None:
            group_maps = memory.cache(randomized_svd)(pcas,
                                                      self.n_components)[0]
            group_maps = group_maps[:, :self.n_components]
            ica_maps = memory.cache(fastica)(group_maps,
                                             whiten=False,
                                             fun='cube',
                                             random_state=self.random_state)[2]
            ica_maps = ica_maps.T
        else:
            ica_maps = self._find_high_kurtosis(pcas, memory)

        del pcas
        self.maps_ = ica_maps
        if not self.maps_only:
            # Relearn the time series
            self.learn_from_maps(data)

        return self
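The two cached numerical steps in this fit come straight from scikit-learn; a standalone sketch of the cached truncated SVD (with 'demo_cache' as an illustrative directory):

import numpy as np
from joblib import Memory
from sklearn.utils.extmath import randomized_svd

mem = Memory('demo_cache', verbose=0)
data = np.random.RandomState(0).rand(100, 40)
U, S, Vt = mem.cache(randomized_svd)(data, 5)  # truncated SVD, cached on disk
print(U.shape)  # (100, 5)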
Example #35
    def fit(self, data, Y=None):
        if hasattr(data, 'copy'):
            # It's an array
            data = data.copy()
        else:
            # Probably a list
            data = copy.deepcopy(data)

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        pcas = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(subject_pca)(subject_data,
                                 n_components=self.n_components, mem=memory)
            for subject_data in data)
        pcas = np.concatenate(pcas, axis=1)

        if self.kurtosis_thr is None:
            group_maps = memory.cache(randomized_svd)(
                pcas, self.n_components)[0]
            group_maps = group_maps[:, :self.n_components]
            ica_maps = memory.cache(fastica)(group_maps, whiten=False,
                                             fun='cube',
                                             random_state=self.random_state)[2]
            ica_maps = ica_maps.T
        else:
            ica_maps = self._find_high_kurtosis(pcas, memory)

        del pcas
        self.maps_ = ica_maps
        if not self.maps_only:
            # Relearn the time series
            self.learn_from_maps(data)

        return self
Example #36
def motion_correction_pypreprocess(in_file,
                                   out_path,
                                   force_mean_reference,
                                   extra_params={}):
    """
    An attempt at motion correction using the pypreprocess package.
    
    inputs:
        in_file: path to the input file or input file loaded as an nibabel image.  
        out_path: path to the future output file
        force_mean_reference: if evaluated True, adjust motion according to the 
                        mean image; otherwise adjust to the first volume. 
        extra_params: extra parameters to MRIMotionCorrection
    return: the motion corrected image
    """

    if force_mean_reference:  # calculate the mean and insert to the front
        print('motion correction referenced to mean!')
        in_file = math_img('np.insert(img, 0, np.mean(img, axis=-1), axis=3)',
                           img=in_file)
    else:
        print('motion correction referenced to the first slice.')

    # instantiate realigner
    if 'MRIMotionCorrection' in extra_params:
        print('extra parameters are used for MRIMotionCorrection: %s'
              % extra_params['MRIMotionCorrection'])
        mrimc = MRIMotionCorrection(**extra_params['MRIMotionCorrection'])
    else:
        mrimc = MRIMotionCorrection()

    # fit realigner
    if USE_CACHE:
        mem = Memory("func_preproc_cache")
        mrimc = mem.cache(mrimc.fit)(in_file)
    else:
        mrimc = mrimc.fit(in_file)

    # write realigned files to disk
    result = mrimc.transform(concat=True)['realigned_images'][0]
    if force_mean_reference:  # remove the first frame, which was the mean
        result = math_img('img[...,1:]', img=result)

    if out_path:
        nib.save(result, out_path)
    return result
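A hedged usage sketch of the function above. The file paths are hypothetical, the module-level USE_CACHE flag is assumed to be defined as in the snippet, and `quality` is only an assumed pypreprocess/SPM-style realignment parameter:

corrected = motion_correction_pypreprocess(
    '/tmp/sub01_bold.nii.gz',            # hypothetical 4D BOLD file
    '/tmp/sub01_bold_mc.nii.gz',
    force_mean_reference=False,
    extra_params={'MRIMotionCorrection': {'quality': 0.95}})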
Example #37
0
    def fit(self, X, y=None, get_rhos=False):
        '''
        Sets up for divergence estimation "from" new data "to" X.
        Builds FLANN indices for each bag, and maybe gets within-bag distances.

        Parameters
        ----------
        X : list of arrays or :class:`skl_groups.features.Features`
            The bags to search "to".

        get_rhos : boolean, optional, default False
            Compute within-bag distances :attr:`rhos_`. These are only needed
            for some divergence functions or if do_sym is passed, and they'll
            be computed (and saved) during :meth:`transform` if they're not
            computed here.

            If you're using Jensen-Shannon divergence, a higher max_K may
            be needed once it sees the number of points in the transformed bags,
            so the computation here might be wasted.
        '''
        self.features_ = X = as_features(X, stack=True, bare=True)

        # if we're using a function that needs to pick its K vals itself,
        # then we need to set max_K here. when we transform(), might have to
        # re-do this :|
        Ks = self._get_Ks()
        _, _, _, max_K, save_all_Ks, _ = _choose_funcs(self.div_funcs, Ks,
                                                       X.dim, X.n_pts, None,
                                                       self.version)

        if max_K >= X.n_pts.min():
            msg = "asked for K = {}, but there's a bag with only {} points"
            raise ValueError(msg.format(max_K, X.n_pts.min()))

        memory = self.memory
        if isinstance(memory, string_types):
            memory = Memory(cachedir=memory, verbose=0)

        self.indices_ = id = memory.cache(_build_indices)(X,
                                                          self._flann_args())
        if get_rhos:
            self.rhos_ = _get_rhos(X, id, Ks, max_K, save_all_Ks,
                                   self.min_dist)
        elif hasattr(self, 'rhos_'):
            del self.rhos_
        return self
Example #38
0
    def fit(self, X, y=None, get_rhos=False):
        '''
        Sets up for divergence estimation "from" new data "to" X.
        Builds FLANN indices for each bag, and maybe gets within-bag distances.

        Parameters
        ----------
        X : list of arrays or :class:`skl_groups.features.Features`
            The bags to search "to".

        get_rhos : boolean, optional, default False
            Compute within-bag distances :attr:`rhos_`. These are only needed
            for some divergence functions or if do_sym is passed, and they'll
            be computed (and saved) during :meth:`transform` if they're not
            computed here.

            If you're using Jensen-Shannon divergence, a higher max_K may
            be needed once it sees the number of points in the transformed bags,
            so the computation here might be wasted.
        '''
        self.features_ = X = as_features(X, stack=True, bare=True)

        # if we're using a function that needs to pick its K vals itself,
        # then we need to set max_K here. when we transform(), might have to
        # re-do this :|
        Ks = self._get_Ks()
        _, _, _, max_K, save_all_Ks, _ = _choose_funcs(
            self.div_funcs, Ks, X.dim, X.n_pts, None, self.version)

        if max_K >= X.n_pts.min():
            msg = "asked for K = {}, but there's a bag with only {} points"
            raise ValueError(msg.format(max_K, X.n_pts.min()))

        memory = self.memory
        if isinstance(memory, string_types):
            memory = Memory(cachedir=memory, verbose=0)

        self.indices_ = id = memory.cache(_build_indices)(X, self._flann_args())
        if get_rhos:
            self.rhos_ = _get_rhos(X, id, Ks, max_K, save_all_Ks, self.min_dist)
        elif hasattr(self, 'rhos_'):
            del self.rhos_
        return self
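Examples #37 and #38 are the same fit method, apparently from skl_groups' KNN divergence estimator; if so, a usage sketch would look like this (random bags, a hypothetical cache path, and the FLANN bindings that skl_groups builds on are assumed installed):

import numpy as np
from skl_groups.divergences import KNNDivergenceEstimator  # assumed import path

bags = [np.random.randn(np.random.randint(25, 40), 4) for _ in range(5)]
est = KNNDivergenceEstimator(div_funcs=['kl'], Ks=[3], memory='/tmp/divs_cache')
est.fit(bags, get_rhos=True)
divs = est.transform(bags)  # shape (1, 1, 5, 5): (div_funcs, Ks, from, to)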
Example #39
0
    def transform(self, X):
        r'''
        Computes the divergences from X to :attr:`features_`.

        Parameters
        ----------
        X : list of bag feature arrays or :class:`skl_groups.features.Features`
            The bags to search "from".

        Returns
        -------
        divs : array of shape ``[len(div_funcs), len(Ks), len(X), len(features_)] + ([2] if do_sym else [])``
            The divergences from X to :attr:`features_`.
            ``divs[d, k, i, j]`` is the ``div_funcs[d]`` divergence
            from ``X[i]`` to ``features_[j]`` using a K of ``Ks[k]``.
            If ``do_sym``, ``divs[d, k, i, j, 0]`` is
            :math:`D_{d,k}( X_i \| \texttt{features_}_j)` and
            ``divs[d, k, i, j, 1]`` is :math:`D_{d,k}(\texttt{features_}_j \| X_i)`.
        '''
        X = as_features(X, stack=True, bare=True)
        Y = self.features_

        Ks = np.asarray(self.Ks)

        if X.dim != Y.dim:
            msg = "incompatible dimensions: fit with {}, transform with {}"
            raise ValueError(msg.format(Y.dim, X.dim))

        memory = self.memory
        if isinstance(memory, string_types):
            memory = Memory(cachedir=memory, verbose=0)

        # ignore Y_indices to avoid slow pickling of them
        # NOTE: if the indices are approximate, then might not get the same
        #       results!
        est = memory.cache(_est_divs, ignore=['n_jobs', 'Y_indices', 'Y_rhos'])
        output, self.rhos_ = est(
            X, Y, self.indices_, getattr(self, 'rhos_', None),
            self.div_funcs, Ks,
            self.do_sym, self.clamp, self.version, self.min_dist,
            self._flann_args(), self._n_jobs)
        return output
Example #40
0
    def transform(self, X):
        r'''
        Computes the divergences from X to :attr:`features_`.

        Parameters
        ----------
        X : list of bag feature arrays or :class:`skl_groups.features.Features`
            The bags to search "from".

        Returns
        -------
        divs : array of shape ``[len(div_funcs), len(Ks), len(X), len(features_)] + ([2] if do_sym else [])``
            The divergences from X to :attr:`features_`.
            ``divs[d, k, i, j]`` is the ``div_funcs[d]`` divergence
            from ``X[i]`` to ``features_[j]`` using a K of ``Ks[k]``.
            If ``do_sym``, ``divs[d, k, i, j, 0]`` is
            :math:`D_{d,k}( X_i \| \texttt{features_}_j)` and
            ``divs[d, k, i, j, 1]`` is :math:`D_{d,k}(\texttt{features_}_j \| X_i)`.
        '''
        X = as_features(X, stack=True, bare=True)
        Y = self.features_

        Ks = np.asarray(self.Ks)

        if X.dim != Y.dim:
            msg = "incompatible dimensions: fit with {}, transform with {}"
            raise ValueError(msg.format(Y.dim, X.dim))

        memory = self.memory
        if isinstance(memory, string_types):
            memory = Memory(cachedir=memory, verbose=0)

        # ignore Y_indices to avoid slow pickling of them
        # NOTE: if the indices are approximate, then might not get the same
        #       results!
        est = memory.cache(_est_divs, ignore=['n_jobs', 'Y_indices', 'Y_rhos'])
        output, self.rhos_ = est(X, Y, self.indices_,
                                 getattr(self, 'rhos_', None), self.div_funcs,
                                 Ks, self.do_sym,
                                 self.clamp, self.version, self.min_dist,
                                 self._flann_args(), self._n_jobs)
        return output
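The `ignore=` argument used above is plain joblib: the listed argument names are left out of the cache key, so changing them neither misses nor invalidates the cache. A minimal sketch (cache path hypothetical):

from joblib import Memory

mem = Memory('/tmp/cache_dir', verbose=0)

def slow(x, n_jobs=1):
    return x ** 2

cached_slow = mem.cache(slow, ignore=['n_jobs'])
assert cached_slow(3, n_jobs=4) == cached_slow(3, n_jobs=1)  # second call is a cache hit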
Example #41
0
def _do_subject_slice_timing(subject_data,
                             ref_slice=0,
                             slice_order="ascending",
                             interleaved=False,
                             caching=True,
                             write_output_images=2,
                             func_prefix=None,
                             func_basenames=None,
                             ext=None):
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['STC']
    if func_basenames is None:
        func_basenames = [get_basenames(func) for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(subject_data.output_dir,
                                           'cache_dir'),
                     verbose=100)
    runner = lambda handle: mem.cache(handle) if caching else handle
    stc_output = []
    original_bold = subject_data.func
    for sess_id, sess_func in enumerate(subject_data.func):
        fmristc = runner(
            fMRISTC(slice_order=slice_order,
                    ref_slice=ref_slice,
                    interleaved=interleaved,
                    verbose=True).fit)(raw_data=sess_func)
        stc_output.append(
            runner(fmristc.transform)(sess_func,
                                      output_dir=subject_data.tmp_output_dir if
                                      (write_output_images > 0) else None,
                                      basenames=func_basenames[sess_id],
                                      prefix=func_prefix,
                                      ext=ext))
    subject_data.func = stc_output
    del original_bold, fmristc
    if write_output_images > 1:
        subject_data.hardlink_output_files()
    return subject_data
Example #42
0
def _do_subject_realign(subject_data, reslice=True, register_to_mean=False,
                        caching=True, hardlink_output=True, ext=None,
                        func_basenames=None, write_output_images=2,
                        report=True, func_prefix=None):
    if register_to_mean:
        raise NotImplementedError("Feature pending...")
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['MC']
    if func_basenames is None:
        func_basenames = [get_basenames(func)
                          for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'), verbose=100)
    runner = lambda handle: mem.cache(handle) if caching else handle
    mrimc = runner(MRIMotionCorrection(
            n_sessions=subject_data.n_sessions, verbose=True).fit)(
        [sess_func for sess_func in subject_data.func])
    mrimc_output = runner(mrimc.transform)(
        reslice=reslice,
        output_dir=subject_data.tmp_output_dir if (
            write_output_images == 2) else None, ext=ext,
        prefix=func_prefix, basenames=func_basenames)
    subject_data.func = mrimc_output['realigned_images']
    subject_data.realignment_parameters = mrimc_output[
        'realignment_parameters']

    # generate realignment thumbs
    if report:
        subject_data.generate_realignment_thumbnails(nipype=False)

    # garbage collection
    del mrimc

    if write_output_images > 1:
        subject_data.hardlink_output_files()

    return subject_data
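The `runner` lambda in the two functions above encodes a small conditional-caching idiom; a standalone sketch of the same idea, under the snippets' directory layout:

import os
from joblib import Memory

def make_runner(output_dir, caching=True):
    # Wrap callables with joblib caching when enabled; pass them through otherwise.
    mem = (Memory(os.path.join(output_dir, 'cache_dir'), verbose=100)
           if caching else None)
    def runner(handle):
        return mem.cache(handle) if caching else handle
    return runner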
Example #43
0
def fetch_asirra(image_count=1000):
    """

    Parameters
    ----------
    image_count : positive integer

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    partial_path = check_fetch_asirra()
    m = Memory(cachedir=partial_path, compress=6, verbose=0)
    load_func = m.cache(_fetch_asirra)
    images, target = load_func(partial_path, image_count=image_count)
    return Bunch(data=images.reshape(len(images), -1),
                 images=images, target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #44
0
def fetch_asirra(image_count=1000):
    """

    Parameters
    ----------
    image_count : positive integer

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    partial_path = check_fetch_asirra()
    m = Memory(cachedir=partial_path, compress=6, verbose=0)
    load_func = m.cache(_fetch_asirra)
    images, target = load_func(partial_path, image_count=image_count)
    return Bunch(data=images.reshape(len(images), -1),
                 images=images,
                 target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #45
0
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
            metric='minkowski', p=2, leaf_size=40,
            algorithm='best', memory=Memory(cachedir=None, verbose=0),
            approx_min_span_tree=True, gen_min_span_tree=False,
            core_dist_n_jobs=4, allow_single_cluster=False, **kwargs):

    """Perform HDBSCAN clustering from a vector array or distance matrix.
    
    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.
        
    min_cluster_size : int, optional
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise.

    min_samples : int, optional
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        Defaults to min_cluster_size.

    alpha : float, optional
        A distance scaling parameter as used in robust single linkage.
        See (K. Chaudhuri and S. Dasgupta  "Rates of convergence
        for the cluster tree."). (default 1.0)

    metric : string, or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.
        (default minkowski)

    p : int, optional
        p value to use if using the minkowski metric. (default 2)

    leaf_size : int, optional
        Leaf size for trees responsible for fast nearest
        neighbour queries. (default 40)

    algorithm : string, optional
        Exactly which algorithm to use; hdbscan has variants specialised 
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of 
        the data. You can force other options if you believe you know 
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.
        (default True)

    gen_min_span_tree : bool, optional
        Whether to generate the minimum spanning tree for later analysis.
        (default False)

    core_dist_n_jobs : int, optional
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm).
        (default 4)


    allow_single_cluster : boolean
        By default HDBSCAN* will not produce a single cluster; setting this
        to True overrides that behaviour and allows single-cluster results
        when you consider that a valid outcome for your dataset.
        (default False)


    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    probabilities : array [n_samples]
        Cluster membership strengths for each point. Noisy samples are assigned
        0.

    cluster_persistence : array, shape = [n_clusters]
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : array [n_samples - 1, 4]
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : array [n_samples - 1, 3]
        The minimum spanning tree as an edgelist. If gen_min_span_tree was
        False, this will be None.

    References
    ----------
    R. Campello, D. Moulavi, and J. Sander, "Density-Based Clustering Based on
    Hierarchical Density Estimates"
    In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172.
    2013
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if type(min_samples) is not int or type(min_cluster_size) is not int:
        raise ValueError('Min samples and min cluster size must be integers!')

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError('Min samples and Min cluster size must be positive integers')

    if not isinstance(alpha, float) or alpha <= 0.0:
        raise ValueError('Alpha must be a positive float greater than 0!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    # Checks input and converts to an nd-array where possible
    X = check_array(X, accept_sparse='csr')
    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        if algorithm == 'generic':
            (single_linkage_tree, 
             result_min_span_tree) = \
                memory.cache(_hdbscan_generic)(X, min_samples, alpha, metric,
                                               p, leaf_size, gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_kdtree':
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Prim's with KDTree for this metric!")
            (single_linkage_tree, 
             result_min_span_tree) = \
                memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
                                                    metric, p, leaf_size,
                                                    gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Prim's with BallTree for this metric!")
            (single_linkage_tree, 
             result_min_span_tree) = \
                memory.cache(_hdbscan_prims_balltree)(X, min_samples, alpha,
                                                      metric, p, leaf_size,
                                                      gen_min_span_tree, **kwargs)
        elif algorithm == 'boruvka_kdtree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with KDTree for this metric!")
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                                      metric, p, leaf_size,
                                                      approx_min_span_tree,
                                                      gen_min_span_tree,
                                                      core_dist_n_jobs, **kwargs)
        elif algorithm == 'boruvka_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with BallTree for this metric!")
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                                        metric, p, leaf_size,
                                                        approx_min_span_tree,
                                                        gen_min_span_tree,
                                                        core_dist_n_jobs, **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:

        if issparse(X) or metric not in FAST_METRICS:  # We can't do much with sparse matrices ...
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_generic)(X, min_samples,
                                               alpha, metric, p, leaf_size,
                                               gen_min_span_tree, **kwargs)
        elif metric in KDTree.valid_metrics:
            #TO DO: Need heuristic to decide when to go to boruvka; still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
                                                        metric, p, leaf_size,
                                                        gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                                          metric, p, leaf_size,
                                                          approx_min_span_tree,
                                                          gen_min_span_tree,
                                                          core_dist_n_jobs, **kwargs)
        else:  # Metric is a valid BallTree metric
            # TO DO: Need heuristic to decide when to go to boruvka; still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_prims_balltree)(X, min_samples,
                                                          alpha, metric, p,
                                                          leaf_size,
                                                          gen_min_span_tree,
                                                          **kwargs)
            else:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                                            metric, p, leaf_size,
                                                            approx_min_span_tree,
                                                            gen_min_span_tree,
                                                            core_dist_n_jobs, **kwargs)

    return _tree_to_labels(X,
                           single_linkage_tree,
                           min_cluster_size,
                           allow_single_cluster) + (result_min_span_tree,)
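A minimal usage sketch of the function above, with random points standing in for real data and a hypothetical cache path. The six return values follow the docstring's order; with gen_min_span_tree left False, the last one is None:

import numpy as np

X = np.random.randn(200, 2)
(labels, probabilities, persistence,
 condensed_tree, single_linkage_tree, min_span_tree) = hdbscan(
    X, min_cluster_size=10, memory='/tmp/hdbscan_cache')
print(np.unique(labels))  # -1 labels noise points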
Example #46
0
masker = input_data.NiftiMapsMasker(msdl_atlas_dataset.maps,
                                    resampling_target="maps",
                                    detrend=True,
                                    low_pass=None,
                                    high_pass=0.01,
                                    t_r=2.5,
                                    standardize=True,
                                    memory='nilearn_cache',
                                    memory_level=1,
                                    verbose=2)
masker.fit()

subject_time_series = []
func_filenames = adhd_dataset.func
confound_filenames = adhd_dataset.confounds
for func_filename, confound_filename in zip(func_filenames,
                                            confound_filenames):
    print("Processing file %s" % func_filename)

    # Computing some confounds
    hv_confounds = mem.cache(image.high_variance_confounds)(func_filename)

    region_ts = masker.transform(func_filename,
                                 confounds=[hv_confounds, confound_filename])
    subject_time_series.append(region_ts)

##############################################################################
# Computing group-sparse precision matrices
from nilearn.connectome import GroupSparseCovarianceCV
gsc = GroupSparseCovarianceCV(verbose=2)
gsc.fit(subject_time_series)

from sklearn import covariance
gl = covariance.GraphLassoCV(verbose=2)
gl.fit(np.concatenate(subject_time_series))
Example #47
0
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
                          gamma=5, metric='minkowski', p=2, algorithm='best',
                          memory=Memory(cachedir=None, verbose=0)):
    """Perform robust single linkage clustering from a vector array
    or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    cut : float
        The reachability distance value to cut the cluster hierarchy at
        to derive a flat cluster labelling.

    k : int, optional
        Reachability distances will be computed with regard to the `k`
        nearest neighbors. (default 5)

    alpha : float, optional
        Distance scaling for reachability distance computation. Reachability
        distance is computed as $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
        (default sqrt(2))

    gamma : int, optional
        Ignore any clusters in the flat clustering with size less than gamma,
        and declare points in such clusters as noise points. (default 5)

    metric : string, or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    algorithm : string, optional
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``generic``
            * ``best``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    single_linkage_tree : array [n_samples - 1, 4]
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    References
    ----------
    K. Chaudhuri and S. Dasgupta.
    "Rates of convergence for the cluster tree."
    In Advances in Neural Information Processing Systems, 2010.

    """

    if type(k) is not int or k < 1:
        raise ValueError('k must be an integer greater than zero!')

    if type(alpha) is not float or alpha < 1.0:
        raise ValueError('alpha must be a float greater than or equal to 1.0!')

    if type(gamma) is not int or gamma < 1:
        raise ValueError('gamma must be an integer greater than zero!')

    X = check_array(X, accept_sparse='csr')
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        if algorithm == 'generic':
            single_linkage_tree = \
                memory.cache(_rsl_generic)(X, k, alpha, metric, p)
        elif algorithm == 'prims_kdtree':
            single_linkage_tree = \
                memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p)
        elif algorithm == 'prims_balltree':
            single_linkage_tree = \
                memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, p)
        elif algorithm == 'boruvka_kdtree':
            single_linkage_tree = \
                memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p)
        elif algorithm == 'boruvka_balltree':
            single_linkage_tree = \
                memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:
        if issparse(X) or metric not in FAST_METRICS:  # We can't do much with sparse matrices ...
            single_linkage_tree = \
                memory.cache(_rsl_generic)(X, k, alpha, metric, p)
        elif metric in KDTree.valid_metrics:
            # Need heuristic to decide when to go to boruvka; still debugging for now
            if X.shape[1] > 128:
                single_linkage_tree = \
                    memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p)
            else:
                single_linkage_tree = \
                    memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p)
        else:  # Metric is a valid BallTree metric
            # Need heuristic to decide when to go to boruvka; still debugging for now
            if X.shape[1] > 128:
                single_linkage_tree = \
                    memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, p)
            else:
                single_linkage_tree = \
                    memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
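A minimal usage sketch of the function above (random data; the cut value is arbitrary):

import numpy as np

X = np.random.randn(150, 3)
labels, single_linkage_tree = robust_single_linkage(X, cut=0.5, k=5, gamma=5)
print(np.unique(labels))  # -1 labels noise points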
Example #48
0
import nilearn.input_data

from sklearn.externals.joblib import Memory
mem = Memory('nilearn_cache')

masker = nilearn.input_data.NiftiMapsMasker(
    msdl_atlas_dataset.maps, resampling_target="maps", detrend=True,
    low_pass=None, high_pass=0.01, t_r=2.5, standardize=True,
    memory=mem, memory_level=1, verbose=2)
masker.fit()

fmri_filename = adhd_dataset.func[0]
confound_filename = adhd_dataset.confounds[0]

# Computing some confounds
hv_confounds = mem.cache(nilearn.image.high_variance_confounds)(
    fmri_filename)

time_series = masker.transform(fmri_filename,
                                confounds=[hv_confounds, confound_filename])


print("-- Computing graph-lasso inverse matrix ...")
from sklearn import covariance
gl = covariance.GraphLassoCV(verbose=2)
gl.fit(time_series)

# Displaying results ##########################################################
atlas_imgs = image.iter_img(msdl_atlas_dataset.maps)
atlas_region_coords = [plotting.find_xyz_cut_coords(img) for img in atlas_imgs]

title = "GraphLasso"
Example #49
0
def _do_fmri_distortion_correction(
        subject_data,
        # I'm unsure of the readout time, but it is constant across both PE
        # directions and so can be scaled to 1 (or any other nonzero float)
        protocol="MOTOR",
        readout_time=.01392,
        realign=True,
        coregister=True,
        coreg_func_to_anat=True,
        dc=True,
        segment=False,
        normalize=False,
        func_write_voxel_sizes=None,
        anat_write_voxel_sizes=None,
        report=False,
        **kwargs):
    """
    Function to undistort task fMRI data for a given HCP subject.

    """

    directions = ['LR', 'RL']

    subject_data.sanitize()

    if dc:
        acq_params = [[1, 0, 0, readout_time], [-1, 0, 0, readout_time]]
        acq_params_file = os.path.join(subject_data.output_dir,
                                       "b0_acquisition_params.txt")
        np.savetxt(acq_params_file, acq_params, fmt='%f')

        fieldmap_files = [
            os.path.join(
                os.path.dirname(subject_data.func[sess]),
                "%s_3T_SpinEchoFieldMap_%s.nii.gz" %
                (subject_data.subject_id, directions[sess]))
            for sess in xrange(subject_data.n_sessions)
        ]
        sbref_files = [
            sess_func.replace(".nii", "_SBRef.nii")
            for sess_func in subject_data.func
        ]

        # prepare for smart caching
        mem = Memory(os.path.join(subject_data.output_dir, "cache_dir"))

        for x in [fieldmap_files, sbref_files, subject_data.func]:
            assert len(x) == 2
            for y in x:
                assert os.path.isfile(y), y

        # fslroi
        zeroth_fieldmap_files = []
        for fieldmap_file in fieldmap_files:
            if not os.path.isfile(fieldmap_file):
                print "Can't find fieldmap file %s; skipping subject %s" % (
                    fieldmap_file, subject_data.subject_id)
                return

            # peel 0th volume of each fieldmap
            zeroth_fieldmap_file = os.path.join(
                subject_data.output_dir,
                "0th_%s" % os.path.basename(fieldmap_file))
            fslroi_cmd = "fsl5.0-fslroi %s %s 0 1" % (fieldmap_file,
                                                      zeroth_fieldmap_file)
            print "\r\nExecuting '%s' ..." % fslroi_cmd
            print mem.cache(commands.getoutput)(fslroi_cmd)

            zeroth_fieldmap_files.append(zeroth_fieldmap_file)

        # merge the 0th volume of both fieldmaps
        merged_zeroth_fieldmap_file = os.path.join(
            subject_data.output_dir, "merged_with_other_direction_%s" %
            (os.path.basename(zeroth_fieldmap_files[0])))
        fslmerge_cmd = "fsl5.0-fslmerge -t %s %s %s" % (
            merged_zeroth_fieldmap_file, zeroth_fieldmap_files[0],
            zeroth_fieldmap_files[1])
        print "\r\nExecuting '%s' ..." % fslmerge_cmd
        print mem.cache(commands.getoutput)(fslmerge_cmd)

        # do topup (learn distortion model)
        topup_results_basename = os.path.join(subject_data.output_dir,
                                              "topup_results")
        topup_cmd = ("fsl5.0-topup --imain=%s --datain=%s --config=b02b0.cnf "
                     "--out=%s" % (merged_zeroth_fieldmap_file,
                                   acq_params_file, topup_results_basename))
        print "\r\nExecuting '%s' ..." % topup_cmd
        print mem.cache(commands.getoutput)(topup_cmd)

        # apply learn deformations to absorb distortion
        dc_fmri_files = []

        for sess in xrange(2):
            # merge SBRef + task BOLD for current PE direction
            assert len(subject_data.func) == 2, subject_data
            fourD_plus_sbref = os.path.join(
                subject_data.output_dir,
                "sbref_plus_" + os.path.basename(subject_data.func[sess]))
            fslmerge_cmd = "fsl5.0-fslmerge -t %s %s %s" % (
                fourD_plus_sbref, sbref_files[sess], subject_data.func[sess])
            print "\r\nExecuting '%s' ..." % fslmerge_cmd
            print mem.cache(commands.getoutput)(fslmerge_cmd)

            # realign task BOLD to SBRef
            sess_output_dir = subject_data.session_output_dirs[sess]
            rfourD_plus_sbref = _do_subject_realign(SubjectData(
                func=[fourD_plus_sbref],
                output_dir=subject_data.output_dir,
                n_sessions=1,
                session_output_dirs=[sess_output_dir]),
                                                    report=False).func[0]

            # apply topup to realigned images
            dc_rfourD_plus_sbref = os.path.join(
                subject_data.output_dir,
                "dc" + os.path.basename(rfourD_plus_sbref))
            applytopup_cmd = (
                "fsl5.0-applytopup --imain=%s --verbose --inindex=%i "
                "--topup=%s --out=%s --datain=%s --method=jac" %
                (rfourD_plus_sbref, sess + 1, topup_results_basename,
                 dc_rfourD_plus_sbref, acq_params_file))
            print "\r\nExecuting '%s' ..." % applytopup_cmd
            print mem.cache(commands.getoutput)(applytopup_cmd)

            # recover undistorted task BOLD
            dc_rfmri_file = dc_rfourD_plus_sbref.replace("sbref_plus_", "")
            fslroi_cmd = "fsl5.0-fslroi %s %s 1 -1" % (dc_rfourD_plus_sbref,
                                                       dc_rfmri_file)
            print "\r\nExecuting '%s' ..." % fslroi_cmd
            print mem.cache(commands.getoutput)(fslroi_cmd)

            # sanity tricks
            if dc_rfmri_file.endswith(".nii"):
                dc_rfmri_file = dc_rfmri_file + ".gz"

            dc_fmri_files.append(dc_rfmri_file)

        subject_data.func = dc_fmri_files
        if isinstance(subject_data.func, basestring):
            subject_data.func = [subject_data.func]

    # continue preprocessing
    subject_data = do_subject_preproc(
        subject_data,
        realign=realign,
        coregister=coregister,
        coreg_anat_to_func=not coreg_func_to_anat,
        segment=True,
        normalize=False,
        report=report)

    # ok for GLM now
    return subject_data
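The function above shells out through the Python-2-only commands module; a hedged Python 3 equivalent of the same cached-command idiom would wrap subprocess instead (cache path hypothetical):

import subprocess
from joblib import Memory

mem = Memory('/tmp/fsl_cache', verbose=0)

def getoutput(cmd):
    # Rough equivalent of commands.getoutput: run via the shell,
    # merging stderr into stdout.
    res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT, text=True)
    return res.stdout

print(mem.cache(getoutput)("echo 'topup would run here'"))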
Example #50
0
def run_suject_level1_glm(
        subject_data,
        readout_time=.01392,  # seconds
        tr=.72,
        dc=True,
        hrf_model="Canonical with Derivative",
        drift_model="Cosine",
        hfcut=100,
        regress_motion=True,
        slicer='ortho',
        cut_coords=None,
        threshold=3.,
        cluster_th=15,
        normalize=True,
        fwhm=0.,
        protocol="MOTOR",
        func_write_voxel_sizes=None,
        anat_write_voxel_sizes=None,
        **other_preproc_kwargs):
    """
    Function to do preproc + analysis for a single HCP subject (task fMRI)

    """

    add_regs_files = None
    n_motion_regressions = 6
    subject_data.n_sessions = 2

    subject_data.tmp_output_dir = os.path.join(subject_data.output_dir, "tmp")
    if not os.path.exists(subject_data.tmp_output_dir):
        os.makedirs(subject_data.tmp_output_dir)

    if not os.path.exists(subject_data.output_dir):
        os.makedirs(subject_data.output_dir)

    mem = Memory(os.path.join(subject_data.output_dir, "cache_dir"),
                 verbose=100)

    # glob design files (.fsf)
    subject_data.design_files = [
        os.path.join(subject_data.data_dir,
                     ("MNINonLinear/Results/tfMRI_%s_%s/"
                      "tfMRI_%s_%s_hp200_s4_level1.fsf") %
                     (protocol, direction, protocol, direction))
        for direction in ['LR', 'RL']
    ]

    assert len(subject_data.design_files) == 2
    for df in subject_data.design_files:
        if not os.path.isfile(df):
            return

    if 0x0:  # deliberately disabled
        subject_data = _do_fmri_distortion_correction(
            subject_data,
            dc=dc,
            fwhm=fwhm,
            readout_time=readout_time,
            **other_preproc_kwargs)

    # chronometry
    stats_start_time = pretty_time()

    # merged lists
    paradigms = []
    frametimes_list = []
    design_matrices = []
    # fmri_files = []
    n_scans = []
    # for direction, direction_index in zip(['LR', 'RL'], xrange(2)):
    for sess in xrange(subject_data.n_sessions):
        direction = ['LR', 'RL'][sess]
        # grab the design file for this session
        design_file = subject_data.design_files[sess]
        if not os.path.isfile(design_file):
            print "Can't find design file %s; skipping subject %s" % (
                design_file, subject_data.subject_id)
            return

        # read the experimental setup
        print "Reading experimental setup from %s ..." % design_file
        fsl_condition_ids, timing_files, fsl_contrast_ids, contrast_values = \
            read_fsl_design_file(design_file)
        print "... done.\r\n"

        # fix timing filenames
        timing_files = [
            tf.replace("EVs", "tfMRI_%s_%s/EVs" % (protocol, direction))
            for tf in timing_files
        ]

        # make design matrix
        print "Constructing design matrix for direction %s ..." % direction
        _n_scans = nibabel.load(subject_data.func[sess]).shape[-1]
        n_scans.append(_n_scans)
        add_regs_file = (add_regs_files[sess]
                         if add_regs_files is not None else None)
        design_matrix, paradigm, frametimes = make_dmtx_from_timing_files(
            timing_files,
            fsl_condition_ids,
            n_scans=_n_scans,
            tr=tr,
            hrf_model=hrf_model,
            drift_model=drift_model,
            hfcut=hfcut,
            add_regs_file=add_regs_file,
            add_reg_names=[
                'Translation along x axis', 'Translation along y axis',
                'Translation along z axis', 'Rotation along x axis',
                'Rotation along y axis', 'Rotation along z axis',
                'Differential Translation along x axis',
                'Differential Translation along y axis',
                'Differential Translation along z axis',
                'Differential Rotation along x axis',
                'Differential Rotation along y axis',
                'Differential Rotation along z axis'
            ][:n_motion_regressions] if add_regs_files is not None else None,
        )

        print "... done."
        paradigms.append(paradigm)
        frametimes_list.append(frametimes)
        design_matrices.append(design_matrix)

        # convert contrasts to dict
        contrasts = dict((
            contrast_id,
            # append zeros to end of contrast to match design
            np.hstack((
                contrast_value,
                np.zeros(len(design_matrix.names) - len(contrast_value)))))
                         for contrast_id, contrast_value in zip(
                             fsl_contrast_ids, contrast_values))

        # more interesting contrasts
        if protocol == 'MOTOR':
            contrasts['RH-LH'] = contrasts['RH'] - contrasts['LH']
            contrasts['LH-RH'] = -contrasts['RH-LH']
            contrasts['RF-LF'] = contrasts['RF'] - contrasts['LF']
            contrasts['LF-RF'] = -contrasts['RF-LF']
            contrasts['H'] = contrasts['RH'] + contrasts['LH']
            contrasts['F'] = contrasts['RF'] + contrasts['LF']
            contrasts['H-F'] = contrasts['H'] - contrasts['F']
            contrasts['F-H'] = -contrasts['H-F']

        contrasts = dict((k, v) for k, v in contrasts.iteritems() if "-" in k)

    # replicate contrasts across sessions
    contrasts = dict((cid, [cval] * 2) for cid, cval in contrasts.iteritems())

    cache_dir = os.path.join(subject_data.output_dir, 'cache_dir')
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    nipype_mem = NipypeMemory(base_dir=cache_dir)

    if 0x0:  # deliberately disabled
        if np.sum(fwhm) > 0.:
            subject_data.func = nipype_mem.cache(spm.Smooth)(
                in_files=subject_data.func,
                fwhm=fwhm,
                ignore_exception=False,
            ).outputs.smoothed_files

    # fit GLM
    def tortoise(*args):
        print args
        print(
            'Fitting a "Fixed Effect" GLM for merging LR and RL '
            'phase-encoding directions for subject %s ...' %
            (subject_data.subject_id))
        fmri_glm = FMRILinearModel(
            subject_data.func,
            [design_matrix.matrix for design_matrix in design_matrices],
            mask='compute')
        fmri_glm.fit(do_scaling=True, model='ar1')
        print "... done.\r\n"

        # save computed mask
        mask_path = os.path.join(subject_data.output_dir, "mask.nii")
        print "Saving mask image to %s ..." % mask_path
        nibabel.save(fmri_glm.mask, mask_path)
        print "... done.\r\n"

        z_maps = {}
        effects_maps = {}
        map_dirs = {}
        try:
            for contrast_id, contrast_val in contrasts.iteritems():
                print "\tcontrast id: %s" % contrast_id
                z_map, eff_map = fmri_glm.contrast(contrast_val,
                                                   con_id=contrast_id,
                                                   output_z=True,
                                                   output_effects=True)

                # store stat maps to disk
                for map_type, out_map in zip(['z', 'effects'],
                                             [z_map, eff_map]):
                    map_dir = os.path.join(subject_data.output_dir,
                                           '%s_maps' % map_type)
                    map_dirs[map_type] = map_dir
                    if not os.path.exists(map_dir):
                        os.makedirs(map_dir)
                    map_path = os.path.join(
                        map_dir, '%s_%s.nii' % (map_type, contrast_id))
                    print "\t\tWriting %s ..." % map_path

                    nibabel.save(out_map, map_path)

                    # collect zmaps for contrasts we're interested in
                    if map_type == 'z':
                        z_maps[contrast_id] = map_path

                    if map_type == 'effects':
                        effects_maps[contrast_id] = map_path

            return effects_maps, z_maps, mask_path, map_dirs
        except Exception:
            return None

    # compute native-space maps and mask
    stuff = mem.cache(tortoise)(subject_data.func, subject_data.anat)
    if stuff is None:
        return None
    effects_maps, z_maps, mask_path, map_dirs = stuff

    # remove repeated contrasts
    contrasts = dict((cid, cval[0]) for cid, cval in contrasts.iteritems())
    import json
    json.dump(
        dict((k, list(v)) for k, v in contrasts.iteritems()),
        open(os.path.join(subject_data.tmp_output_dir, "contrasts.json"), "w"))
    subject_data.contrasts = contrasts

    if normalize:
        assert hasattr(subject_data, "parameter_file")

        subject_data.native_effects_maps = effects_maps
        subject_data.native_z_maps = z_maps
        subject_data.native_mask_path = mask_path

        # warp effects maps and mask from native to standard space (MNI)
        apply_to_files = [
            v for _, v in subject_data.native_effects_maps.iteritems()
        ] + [subject_data.native_mask_path]
        tmp = nipype_mem.cache(spm.Normalize)(
            parameter_file=getattr(subject_data, "parameter_file"),
            apply_to_files=apply_to_files,
            write_bounding_box=[[-78, -112, -50], [78, 76, 85]],
            write_voxel_sizes=func_write_voxel_sizes,
            write_wrap=[0, 0, 0],
            write_interp=1,
            jobtype='write',
            ignore_exception=False,
        ).outputs.normalized_files

        subject_data.mask = hard_link(tmp[-1], subject_data.output_dir)
        subject_data.effects_maps = dict(
            zip(effects_maps.keys(), hard_link(tmp[:-1], map_dirs["effects"])))

        # warp anat image
        subject_data.anat = hard_link(
            nipype_mem.cache(spm.Normalize)(
                parameter_file=getattr(subject_data, "parameter_file"),
                apply_to_files=subject_data.anat,
                write_bounding_box=[[-78, -112, -50], [78, 76, 85]],
                write_voxel_sizes=anat_write_voxel_sizes,
                write_wrap=[0, 0, 0],
                write_interp=1,
                jobtype='write',
                ignore_exception=False,
            ).outputs.normalized_files, subject_data.anat_output_dir)
    else:
        subject_data.mask = mask_path
        subject_data.effects_maps = effects_maps
        subject_data.z_maps = z_maps

    return subject_data
Example #51
0
masker = input_data.NiftiMapsMasker(
    msdl_atlas_dataset.maps, resampling_target="maps", detrend=True,
    low_pass=None, high_pass=0.01, t_r=2.5, standardize=True,
    memory='nilearn_cache', memory_level=1, verbose=2)
masker.fit()

subject_time_series = []
func_filenames = adhd_dataset.func
confound_filenames = adhd_dataset.confounds
for func_filename, confound_filename in zip(func_filenames,
                                            confound_filenames):
    print("Processing file %s" % func_filename)

    # Computing some confounds
    hv_confounds = mem.cache(image.high_variance_confounds)(
        func_filename)

    region_ts = masker.transform(func_filename,
                                 confounds=[hv_confounds, confound_filename])
    subject_time_series.append(region_ts)


##############################################################################
# Computing group-sparse precision matrices
# ------------------------------------------
from nilearn.connectome import GroupSparseCovarianceCV
gsc = GroupSparseCovarianceCV(verbose=2)
gsc.fit(subject_time_series)

from sklearn import covariance
gl = covariance.GraphLassoCV(verbose=2)
Example #52
0
def robust_single_linkage(X,
                          cut,
                          k=5,
                          alpha=1.4142135623730951,
                          gamma=5,
                          metric='euclidean',
                          algorithm='best',
                          memory=Memory(cachedir=None, verbose=0),
                          leaf_size=40,
                          core_dist_n_jobs=4,
                          **kwargs):
    """Perform robust single linkage clustering from a vector array
    or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    cut : float
        The reachability distance value to cut the cluster hierarchy at
        to derive a flat cluster labelling.

    k : int, optional (default=5)
        Reachability distances will be computed with regard to the `k`
        nearest neighbors.

    alpha : float, optional (default=np.sqrt(2))
        Distance scaling for reachability distance computation. Reachability
        distance is computed as
        $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.

    gamma : int, optional (default=5)
        Ignore any clusters in the flat clustering with size less than gamma,
        and declare points in such clusters as noise points.

    metric : string, or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``generic``
            * ``best``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    core_dist_n_jobs : int, optional
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
        (default 4)

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point.  Noisy samples are given the label -1.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    References
    ----------
    .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    """

    if not isinstance(k, int) or k < 1:
        raise ValueError('k must be an integer greater than zero!')

    if not isinstance(alpha, float) or alpha < 1.0:
        raise ValueError('alpha must be a float greater than or equal to 1.0!')

    if not isinstance(gamma, int) or gamma < 1:
        raise ValueError('gamma must be an integer greater than zero!')

    if not isinstance(leaf_size, int) or leaf_size < 1:
        raise ValueError('Leaf size must be at least one!')

    if metric == 'minkowski':
        if 'p' not in kwargs or kwargs['p'] is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if kwargs['p'] < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')

    X = check_array(X, accept_sparse='csr')
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        if algorithm == 'generic':
            single_linkage_tree = memory.cache(_rsl_generic)(X, k, alpha,
                                                             metric, **kwargs)
        elif algorithm == 'prims_kdtree':
            single_linkage_tree = memory.cache(_rsl_prims_kdtree)(X, k, alpha,
                                                                  metric,
                                                                  **kwargs)
        elif algorithm == 'prims_balltree':
            single_linkage_tree = memory.cache(_rsl_prims_balltree)(X, k,
                                                                    alpha,
                                                                    metric,
                                                                    **kwargs)
        elif algorithm == 'boruvka_kdtree':
            single_linkage_tree = \
                memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, leaf_size,
                                                  core_dist_n_jobs, **kwargs)
        elif algorithm == 'boruvka_balltree':
            single_linkage_tree = \
                memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, leaf_size,
                                                    core_dist_n_jobs, **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            single_linkage_tree = memory.cache(_rsl_generic)(X, k, alpha,
                                                             metric, **kwargs)
        elif metric in KDTree.valid_metrics:
            # Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 128:
                single_linkage_tree = memory.cache(_rsl_prims_kdtree)(X, k,
                                                                      alpha,
                                                                      metric,
                                                                      **kwargs)
            else:
                single_linkage_tree = \
                    memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric,
                                                      leaf_size,
                                                      core_dist_n_jobs,
                                                      **kwargs)
        else:  # Metric is a valid BallTree metric
            # Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 128:
                single_linkage_tree = \
                    memory.cache(_rsl_prims_balltree)(X, k, alpha, metric,
                                                      **kwargs)
            else:
                single_linkage_tree = \
                    memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric,
                                                        leaf_size,
                                                        core_dist_n_jobs,
                                                        **kwargs)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree.to_numpy()
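For reference, a minimal usage sketch of the function above, assuming it is the robust_single_linkage helper exposed by the hdbscan package (whose implementation the snippet matches); the parameter values are purely illustrative.

import numpy as np
from hdbscan import robust_single_linkage

rng = np.random.RandomState(0)
# Two well-separated blobs plus a handful of uniform outliers.
X = np.vstack([rng.normal(0.0, 0.3, size=(50, 2)),
               rng.normal(5.0, 0.3, size=(50, 2)),
               rng.uniform(-2.0, 7.0, size=(5, 2))])

labels, single_linkage_tree = robust_single_linkage(
    X, cut=1.0, k=5, alpha=1.4142, gamma=5,
    metric='euclidean', algorithm='best')
print(labels)                     # noise points get the label -1
print(single_linkage_tree.shape)  # (n_samples - 1, 4), scipy linkage format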
Example #53
0
        # set func
        subject_data.func = [x for x in session_func if subject_id in x]
        assert len(subject_data.func) == 1
        subject_data.func = subject_data.func[0]

        # set anat
        subject_data.anat = [x for x in session_anat if subject_id in x]
        assert len(subject_data.anat) == 1
        subject_data.anat = subject_data.anat[0]

        # set subject output directory
        subject_data.output_dir = "/tmp/%s" % subject_id

        subject_data.sanitize(deleteorient=True, niigz2nii=False)

        yield (subject_data.subject_id, subject_data.func[0],
               subject_data.anat)

# spm auditory demo
mem.cache(_run_demo)(*_spm_auditory_factory())

# NYU rest demo
for subject_id, func, anat in _nyu_rest_factory():
    print "%s +++NYU rest %s+++\r\n" % ("\t" * 5, subject_id)
    mem.cache(_run_demo)(func, anat)

# ABIDE demo
for subject_id, func, anat in _abide_factory():
    print "%s +++ABIDE %s+++\r\n" % ("\t" * 5, subject_id)
    mem.cache(_run_demo)(func, anat)
Example #54
0
def do_subject_glm(subject_data):
    """FE analysis for a single subject."""
    subject_id = subject_data['subject_id']
    output_dir = subject_data["output_dir"]
    func_files = subject_data['func']
    anat = subject_data['anat']
    onset_files = subject_data['onset']
    # subject_id = os.path.basename(subject_dir)
    # subject_output_dir = os.path.join(output_dir, subject_id)
    mem = Memory(os.path.join(output_dir, "cache"))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # glob files: anat, session func files, session onset files
    # anat = glob.glob(os.path.join(subject_dir, anat_wildcard))
    # assert len(anat) == 1
    # anat = anat[0]
    # onset_files = sorted([glob.glob(os.path.join(subject_dir, session))[0]
    #                       for session in session_onset_wildcards])
    # func_files = sorted([sorted(glob.glob(os.path.join(subject_dir, session)))
    #                      for session in session_func_wildcards])

    ### Preprocess data #######################################################
    if 0:  # flip to 1 to enable the preprocessing block below
        subject_data = mem.cache(do_subject_preproc)(
            dict(func=func_files, anat=anat, output_dir=output_dir))
        func_files = subject_data['func']
        anat = subject_data['anat']

        # reslice func images
        func_files = [mem.cache(reslice_vols)(
            sess_func,
            target_affine=nibabel.load(sess_func[0]).get_affine())
                      for sess_func in func_files]

    ### GLM: loop on (session_bold, onset_file) pairs over the various sessions
    design_matrices = []
    for session, (func_file, onset_file) in enumerate(zip(func_files,
                                                          onset_files)):
        if isinstance(func_file, str):
            bold = nibabel.load(func_file)
            assert len(bold.shape) == 4
            n_scans = bold.shape[-1]
            del bold
        else:
            if len(func_file) == 1:
                func_file = func_file[0]
                bold = nibabel.load(func_file)
                assert len(bold.shape) == 4
                n_scans = bold.shape[-1]
                del bold
            else:
                n_scans = len(func_file)
        frametimes = np.linspace(0, (n_scans - 1) * tr, n_scans)
        conditions, onsets, durations, amplitudes = parse_onset_file(
            onset_file)
        onsets *= tr
        durations *= tr
        paradigm = BlockParadigm(con_id=conditions, onset=onsets,
                                 duration=durations, amplitude=amplitudes)
        design_matrices.append(make_dmtx(frametimes,
                                         paradigm, hrf_model=hrf_model,
                                         drift_model=drift_model,
                                         hfcut=hfcut))

    # specify contrasts
    n_columns = len(design_matrices[0].names)
    contrasts = {}
    for i in xrange(paradigm.n_conditions):
        contrasts[design_matrices[0].names[2 * i]] = np.eye(n_columns)[2 * i]

    # more interesting contrasts
    contrasts['faces-scrambled'] = contrasts['faces'] - contrasts['scrambled']
    contrasts['scrambled-faces'] = -contrasts['faces-scrambled']
    contrasts['effects_of_interest'] = (contrasts['faces'] +
                                        contrasts['scrambled'])

    # effects of interest F-test
    diff_contrasts = []
    for i in xrange(paradigm.n_conditions - 1):
        a = contrasts[design_matrices[0].names[2 * i]]
        b = contrasts[design_matrices[0].names[2 * (i + 1)]]
        diff_contrasts.append(a - b)
    contrasts["diff"] = diff_contrasts

    # fit GLM
    print 'Fitting a GLM (this takes time)...'
    fmri_glm = FMRILinearModel([nibabel.concat_images(sess_func,
                                                      check_affines=False)
                                for sess_func in func_files],
                               [design_matrix.matrix
                                for design_matrix in design_matrices],
                               mask='compute'
                               )
    fmri_glm.fit(do_scaling=True, model='ar1')

    # save computed mask
    mask_path = os.path.join(output_dir, "mask.nii.gz")

    print "Saving mask image %s" % mask_path
    nibabel.save(fmri_glm.mask, mask_path)

    # compute contrasts
    z_maps = {}
    effects_maps = {}
    for contrast_id, contrast_val in contrasts.iteritems():
        print "\tcontrast id: %s" % contrast_id
        if np.ndim(contrast_val) > 1:
            contrast_type = "F"
        else:
            contrast_type = "t"
        z_map, t_map, effects_map, var_map = fmri_glm.contrast(
            [contrast_val] * 2,  # same contrast replicated over the 2 sessions
            con_id=contrast_id,
            contrast_type=contrast_type,
            output_z=True,
            output_stat=True,
            output_effects=True,
            output_variance=True
            )

        # store stat maps to disk
        for map_type, out_map in zip(['z', 't', 'effects', 'variance'],
                                     [z_map, t_map, effects_map, var_map]):
            map_dir = os.path.join(
                output_dir, '%s_maps' % map_type)
            if not os.path.exists(map_dir):
                os.makedirs(map_dir)
            map_path = os.path.join(
                map_dir, '%s.nii.gz' % contrast_id)
            print "\t\tWriting %s ..." % map_path
            nibabel.save(out_map, map_path)

            # collect zmaps for contrasts we're interested in
            if map_type == 'z':
                z_maps[contrast_id] = map_path
            if map_type == 'effects':
                effects_maps[contrast_id] = map_path

    return subject_id, anat, effects_maps, z_maps, contrasts, fmri_glm.mask
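The design-matrix step above leans on nipy; below is a hedged, self-contained sketch of just that step, assuming nipy's BlockParadigm and make_dmtx APIs (nipy.modalities.fmri) and made-up onsets, durations and TR.

import numpy as np
from nipy.modalities.fmri.experimental_paradigm import BlockParadigm
from nipy.modalities.fmri.design_matrix import make_dmtx

tr, n_scans = 2.0, 100
frametimes = np.linspace(0, (n_scans - 1) * tr, n_scans)
# Two blocks with made-up onsets/durations (in seconds).
paradigm = BlockParadigm(con_id=['faces', 'scrambled'],
                         onset=[10., 50.], duration=[20., 20.])
dmtx = make_dmtx(frametimes, paradigm, hrf_model='canonical',
                 drift_model='cosine', hfcut=128)
print(dmtx.names)  # condition columns first, then drift/constant terms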
Example #55
0
                map_dir, '%s.nii.gz' % contrast_id)
            print "\t\tWriting %s ..." % map_path
            nibabel.save(out_map, map_path)

            # collect zmaps for contrasts we're interested in
            if map_type == 'z':
                z_maps[contrast_id] = map_path
            if map_type == 'effects':
                effects_maps[contrast_id] = map_path

    return subject_id, anat, effects_maps, z_maps, contrasts, fmri_glm.mask


if __name__ == "__main__":
    mem = Memory(os.path.join(output_dir, "cache"))
    first_level_glms = map(mem.cache(do_subject_glm), subject_dirs)

    # plot stats (per subject)
    import matplotlib.pyplot as plt
    from nilearn.plotting import plot_stat_map
    all_masks = []
    all_effects_maps = []
    for (subject_id, anat, effects_maps, z_maps,
         contrasts, mask) in first_level_glms:
        all_masks.append(mask)
        anat_img = nibabel.load(anat)
        all_effects_maps.append(effects_maps)
        for contrast_id, z_map in z_maps.iteritems():
            plot_stat_map(z_map, black_bg=True, threshold=2.3,
                          title="%s: %s" % (subject_id, contrast_id))
class FirstLevelModel(BaseEstimator, TransformerMixin, CacheMixin):
    """ Implementation of the General Linear Model for single session fMRI data

    Parameters
    ----------

    t_r: float
        Repetition time of the experimental runs, in seconds. It is necessary
        to correctly consider times in the design matrix. This parameter is
        also passed to nilearn.signal.clean. Please see the related
        documentation for details.

    slice_time_ref: float, optional (default 0.)
        This parameter indicates the time of the reference slice used in the
        slice timing preprocessing step of the experimental runs. It is
        expressed as a fraction of the t_r (repetition time), so it can have
        values between 0. and 1.

    hrf_model : string, optional
        This parameter specifies the hemodynamic response function (HRF) for
        the design matrices. It can be 'glover', 'glover + derivative',
        'spm', 'spm + derivative' or 'fir'. Defaults to 'glover'.

    drift_model : string, optional
        This parameter specifies the desired drift model for the design
        matrices. It can be 'polynomial', 'cosine' or 'blank'.

    period_cut : float, optional
        This parameter specifies the cut period, in seconds, of the high-pass
        (drift-removal) filter used in the design matrices.

    drift_order : int, optional
        This parameter specifies the order of the drift model (in case it is
        polynomial) for the design matrices.

    fir_delays : array of shape (n_onsets,) or list, optional
        In case of FIR design, yields the array of delays used in the FIR
        model, in seconds.

    min_onset : float, optional
        This parameter specifies the minimal onset relative to the design
        (in seconds). Events that start before (slice_time_ref * t_r +
        min_onset) are not considered.

    mask: Niimg-like, NiftiMasker or MultiNiftiMasker object, optional,
        Mask to be used on data. If an instance of masker is passed,
        then its mask will be used. If no mask is given,
        it will be computed automatically by a MultiNiftiMasker with default
        parameters.

    target_affine: 3x3 or 4x4 matrix, optional
        This parameter is passed to nilearn.image.resample_img. Please see the
        related documentation for details.

    target_shape: 3-tuple of integers, optional
        This parameter is passed to nilearn.image.resample_img. Please see the
        related documentation for details.

    smoothing_fwhm: float, optional
        If smoothing_fwhm is not None, it gives the size in millimeters of the
        spatial smoothing to apply to the signal.

    memory: string, optional
        Path to the directory used to cache the masking process and the GLM
        fit. By default, no caching is done. If a path is given, a
        joblib.Memory instance is created from it.

    memory_level: integer, optional
        Rough estimator of the amount of memory used by caching. Higher value
        means more memory for caching.

    standardize : boolean, optional
        If standardize is True, the time-series are centered and normed:
        their variance is put to 1 in the time dimension.

    signal_scaling: False, int or (int, int), optional,
        If not False, fMRI signals are scaled to the mean value of scaling_axis
        given, which can be 0, 1 or (0, 1). 0 refers to mean scaling each voxel
        with respect to time, 1 refers to mean scaling each time point with
        respect to all voxels and (0, 1) refers to scaling with respect to
        voxels and time, which is known as grand mean scaling.
        Incompatible with standardize (standardize=False is enforced when
        signal_scaling is not False).

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'

    verbose : integer, optional
        Indicate the level of verbosity. Defaults to 1, which reports fitting
        progress; set it to 0 to print nothing.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs', -2 'all CPUs but one', and so on.

    minimize_memory : boolean, optional
        Gets rid of some variables on the model fit results that are not
        necessary for contrast computation and would only be useful for
        further inspection of model details. This has an important impact
        on memory consumption. True by default.

    Attributes
    ----------
    labels_ : array of shape (n_voxels,)
        A map of values on voxels used to identify the corresponding model.

    results_ : dict
        Keys correspond to the different labels values; values are
        RegressionResults instances for the corresponding voxels.
    """
    def __init__(self, t_r=None, slice_time_ref=0., hrf_model='glover',
                 drift_model='cosine', period_cut=128, drift_order=1,
                 fir_delays=[0], min_onset=-24, mask=None, target_affine=None,
                 target_shape=None, smoothing_fwhm=None, memory=Memory(None),
                 memory_level=1, standardize=False, signal_scaling=0,
                 noise_model='ar1', verbose=1, n_jobs=1,
                 minimize_memory=True):
        # design matrix parameters
        self.t_r = t_r
        self.slice_time_ref = slice_time_ref
        self.hrf_model = hrf_model
        self.drift_model = drift_model
        self.period_cut = period_cut
        self.drift_order = drift_order
        self.fir_delays = fir_delays
        self.min_onset = min_onset
        # glm parameters
        self.mask = mask
        self.target_affine = target_affine
        self.target_shape = target_shape
        self.smoothing_fwhm = smoothing_fwhm
        if isinstance(memory, _basestring):
            self.memory = Memory(memory)
        else:
            self.memory = memory
        self.memory_level = memory_level
        self.standardize = standardize
        if signal_scaling in [0, 1, (0, 1)]:
            self.scaling_axis = signal_scaling
            self.signal_scaling = True
            self.standardize = False
        elif signal_scaling is False:
            self.signal_scaling = signal_scaling
        else:
            raise ValueError('signal_scaling must be "False", "0", "1"'
                             ' or "(0, 1)"')
        self.noise_model = noise_model
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.minimize_memory = minimize_memory
        # attributes
        self.labels_ = None
        self.results_ = None

    def fit(self, run_imgs, paradigms=None, confounds=None,
            design_matrices=None):
        """ Fit the GLM

        For each run:
        1. create design matrix X
        2. do a masker job: fMRI_data -> Y
        3. fit regression to (Y, X)

        Parameters
        ----------
        run_imgs: Niimg-like object or list of Niimg-like objects,
            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
            Data on which the GLM will be fitted. If this is a list,
            the affine is considered the same for all.

        paradigms: pandas Dataframe or string or list of pandas DataFrames or
                   strings,
            fMRI paradigms used to build design matrices. One paradigm expected
            per run_img. Ignored if design_matrices is not None.

        confounds: pandas Dataframe or string or list of pandas DataFrames or
                   strings,
            Each column in a DataFrame corresponds to a confound variable
            to be included in the regression model of the respective run_img.
            The number of rows must match the number of volumes in the
            respective run_img. Ignored if design_matrices is not None.

        design_matrices: pandas DataFrame or list of pandas DataFrames,
            Design matrices that will be used to fit the GLM.
        """
        # Check arguments
        # Check imgs type
        if not isinstance(run_imgs, (list, tuple)):
            run_imgs = [run_imgs]
        for rimg in run_imgs:
            if not isinstance(rimg, (_basestring, Nifti1Image)):
                raise ValueError('run_imgs must be Niimg-like object or list'
                                 ' of Niimg-like objects')
        # check all information necessary to build design matrices is available
        if design_matrices is None:
            if paradigms is None:
                raise ValueError('paradigms or design matrices must be provided')
            if self.t_r is None:
                raise ValueError('t_r not given to FirstLevelModel object'
                                 ' to compute design from paradigm')
        else:
            design_matrices = _check_run_tables(run_imgs, design_matrices,
                                                'design_matrices')
        # check the number of paradigm and confound files match number of runs
        # Also check paradigm and confound files can be loaded as DataFrame
        if paradigms is not None:
            paradigms = _check_run_tables(run_imgs, paradigms, 'paradigms')

        if confounds is not None:
            confounds = _check_run_tables(run_imgs, confounds, 'confounds')

        # Learn the mask
        if not isinstance(self.mask, NiftiMasker):
            self.masker_ = NiftiMasker(
                mask_img=self.mask, smoothing_fwhm=self.smoothing_fwhm,
                target_affine=self.target_affine,
                standardize=self.standardize, mask_strategy='epi',
                t_r=self.t_r, memory=self.memory,
                verbose=max(0, self.verbose - 1),
                target_shape=self.target_shape,
                memory_level=self.memory_level)
        else:
            self.masker_ = clone(self.mask)
            for param_name in ['target_affine', 'target_shape',
                               'smoothing_fwhm', 'low_pass', 'high_pass',
                               't_r', 'memory', 'memory_level']:
                our_param = getattr(self, param_name, None)
                if our_param is None:
                    continue
                if getattr(self.masker_, param_name) is not None:
                    warn('Parameter %s of the masker overridden' % param_name)
                setattr(self.masker_, param_name, our_param)
        self.masker_.fit(run_imgs[0])

        # For each run fit the model and keep only the regression results.
        self.labels_, self.results_, self.design_matrices_ = [], [], []
        n_runs = len(run_imgs)
        t0 = time.time()
        for run_idx, run_img in enumerate(run_imgs):
            # Report progress
            if self.verbose > 0:
                percent = float(run_idx) / n_runs
                percent = round(percent * 100, 2)
                dt = time.time() - t0
                # We use a max to avoid a division by zero
                if run_idx == 0:
                    remaining = 'go take a coffee, a big one'
                else:
                    remaining = (100. - percent) / max(0.01, percent) * dt
                    remaining = '%i seconds remaining' % remaining
                sys.stderr.write(" " * 100 + "\r")
                sys.stderr.write(
                    "Computing run %d out of %d runs (%s)\r"
                    % (run_idx + 1, n_runs, remaining))

            # Build the experimental design for the glm
            run_img = check_niimg(run_img, ensure_ndim=4)
            if design_matrices is None:
                n_scans = run_img.get_data().shape[3]
                if confounds is not None:
                    confounds_matrix = confounds[run_idx].values
                    if confounds_matrix.shape[0] != n_scans:
                        raise ValueError('Rows in confounds does not match'
                                         ' n_scans in run_img at index %d'
                                         % (run_idx,))
                    confounds_names = confounds[run_idx].columns
                else:
                    confounds_matrix = None
                    confounds_names = None
                start_time = self.slice_time_ref * self.t_r
                end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r
                frame_times = np.linspace(start_time, end_time, n_scans)
                design = make_design_matrix(frame_times, paradigms[run_idx],
                                            self.hrf_model, self.drift_model,
                                            self.period_cut, self.drift_order,
                                            self.fir_delays, confounds_matrix,
                                            confounds_names, self.min_onset)
            else:
                design = design_matrices[run_idx]
            self.design_matrices_.append(design)

            # Compute GLM
            Y = self.masker_.transform(run_img)
            if self.signal_scaling:
                Y, _ = mean_scaling(Y, self.scaling_axis)
            if self.memory is not None:
                mem_glm = self.memory.cache(run_glm)
            else:
                mem_glm = run_glm
            labels, results = mem_glm(Y, design,
                                      noise_model=self.noise_model,
                                      bins=100, n_jobs=self.n_jobs)
            self.labels_.append(labels)
            # We save memory if inspecting model details is not necessary
            if self.minimize_memory:
                for key in results:
                    results[key] = SimpleRegressionResults(results[key])
            self.results_.append(results)
            del Y

        # Report progress
        if self.verbose > 0:
            sys.stderr.write("\nComputation of %d runs done in %i seconds\n"
                             % (n_runs, time.time() - t0))

        return self

    def compute_contrast(self, contrast_def, contrast_name=None,
                         stat_type=None, output_type='z_score'):
        """Generate different outputs corresponding to
        the contrasts provided e.g. z_map, t_map, effects and variance.
        In multi-session case, outputs the fixed effects map.

        Parameters
        ----------
        contrast_def : array or list of arrays of shape (n_col) or (n_run, n_col)
            where ``n_col`` is the number of columns of the design matrix,
            (one array per run). If only one array is provided when there
            are several runs, it will be assumed that the same contrast is
            desired for all runs

        contrast_name : str, optional
            name of the contrast

        stat_type : {'t', 'F'}, optional
            type of the contrast

        output_type : str, optional
            Type of the output map. Can be 'z_score', 'stat', 'p_value',
            'effect_size' or 'effect_variance'

        Returns
        -------
        output_image : Nifti1Image
            The desired output image

        """
        if self.labels_ is None or self.results_ is None:
            raise ValueError('The model has not been fit yet')

        if isinstance(contrast_def, np.ndarray):
            con_vals = [contrast_def]
        elif isinstance(contrast_def, (list, tuple)):
            con_vals = contrast_def
            for cidx, con in enumerate(contrast_def):
                if not isinstance(con, np.ndarray):
                    raise ValueError('contrast_def at index %i is not an'
                                     ' array' % cidx)
        else:
            raise ValueError('contrast_def must be an array or list of arrays')
        n_runs = len(self.labels_)
        if len(con_vals) != n_runs:
            warn('One contrast given, assuming it for all %d runs' % n_runs)
            con_vals = con_vals * n_runs
        if isinstance(output_type, _basestring):
            if output_type not in ['z_score', 'stat', 'p_value', 'effect_size',
                                   'effect_variance']:
                raise ValueError('output_type must be one of "z_score", "stat",'
                                 ' "p_value","effect_size" or "effect_variance"')
        else:
            raise ValueError('output_type must be one of "z_score", "stat",'
                             ' "p_value","effect_size" or "effect_variance"')

        if self.memory is not None:
            arg_ignore = ['labels', 'results']
            mem_contrast = self.memory.cache(_fixed_effect_contrast,
                                             ignore=arg_ignore)
        else:
            mem_contrast = _fixed_effect_contrast
        contrast = mem_contrast(self.labels_, self.results_, con_vals,
                                stat_type)

        estimate_ = getattr(contrast, output_type)()
        # Prepare the returned images
        output = self.masker_.inverse_transform(estimate_)
        if contrast_name is None:
            contrast_name = str(con_vals)
        output.get_header()['descrip'] = (
            '%s of contrast %s' % (output_type, contrast_name))
        return output
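A minimal end-to-end sketch of the class above, assuming it is nistats' FirstLevelModel (nistats.first_level_model); the file name and paradigm contents are placeholders, and the paradigm column names may differ across nistats versions.

import numpy as np
import pandas as pd
from nistats.first_level_model import FirstLevelModel

# Placeholder paradigm; the column names are an assumption of this sketch.
paradigm = pd.DataFrame({'onset': [0., 30., 60.],
                         'duration': [15., 15., 15.],
                         'trial_type': ['faces', 'scrambled', 'faces']})

model = FirstLevelModel(t_r=2.0, hrf_model='glover', drift_model='cosine',
                        noise_model='ar1', memory='/tmp/nistats_cache')
model = model.fit('run1_bold.nii.gz', paradigms=paradigm)

# Size the contrast vector to the design matrix; this sketch assumes the
# two condition columns come first.
n_columns = model.design_matrices_[0].shape[1]
con = np.zeros(n_columns)
con[0], con[1] = 1, -1
z_map = model.compute_contrast(con, output_type='z_score')
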
    if output_dir is not None:
        with open(join(debug_folder, 'score'), 'w+') as f:
            f.write('score : %.4f' % score)

    return score


output_dir = expanduser(join('~/output/dl_recommender/',
                             datetime.datetime.now().strftime('%Y-%m-%d_%H'
                                                              '-%M-%S')))
os.makedirs(output_dir)

random_state = check_random_state(0)
mem = Memory(cachedir=expanduser("~/cache"), verbose=10)
X_csr = mem.cache(fetch_ml_10m)(expanduser('~/data/own/ml-10M100K'),
                                remove_empty=True)

permutation = random_state.permutation(X_csr.shape[0])

X_csr = X_csr[permutation]

X, y = array_to_fm_format(X_csr)

uniform_split = ShuffleSplit(n_iter=4,
                             test_size=.25, random_state=random_state)

fm_decoder = FMDecoder(n_samples=X_csr.shape[0], n_features=X_csr.shape[1])

base_estimator = BaseRecommender(fm_decoder)

convex_fm = ConvexFM(fit_linear=True, alpha=0, max_rank=20,
Example #58
0
        n_sources, n_times = mean_stc.data.shape
        X = np.empty((len(stcs), n_sources, n_times))
        for i, stc in enumerate(stcs):
            if len(times) == len(stc.times):
                X[i] = stc.data
        mean_stc._data = np.mean(X, axis=0)
        return mean_stc, X
        print "Jane here"

# X1, X2 are the full time/vertices/subject matrices; mean_stc1 and mean_stc2 are the grand averages

    mean_stc1, X1 = average_stcs(stcs1)
    mean_stc2, X2 = average_stcs(stcs2)
    return mean_stc1, X1, mean_stc2, X2

mean_stc1, X1, mean_stc2, X2 = mem.cache(load_data)(stcs1_fname, stcs2_fname,
                                                    dec)

template_stc = copy.deepcopy(mean_stc1)
stc_diff = copy.deepcopy(template_stc)
stc_diff._data = mean_stc2.data - mean_stc1.data  # STC cond 2 - STC cond 1
stc_diff.save(
    '/cluster/kuperberg/SemPrMM/MEG/results/source_space/cluster_stats/' +
    prefix + 'diff_of_means')

if time_interval is not None:  # squash time interval
    tmin, tmax = time_interval
    times = mean_stc1.times
    mask = (times >= tmin) & (times <= tmax)
    X1 = np.mean(X1[:, :, mask], axis=2)[:, :, None]
    X2 = np.mean(X2[:, :, mask], axis=2)[:, :, None]
    template_stc = copy.deepcopy(template_stc)
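The time_interval block above collapses the data to a single averaged time point while keeping a singleton time axis; here is a standalone numpy sketch of that masking-and-averaging pattern with made-up shapes.

import numpy as np

n_subjects, n_sources, n_times = 4, 10, 100
times = np.linspace(-0.1, 0.9, n_times)             # seconds
X = np.random.randn(n_subjects, n_sources, n_times)

tmin, tmax = 0.1, 0.3
mask = (times >= tmin) & (times <= tmax)            # boolean time selector
X_win = np.mean(X[:, :, mask], axis=2)[:, :, None]  # keep the time axis
print(X_win.shape)  # (4, 10, 1)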
Example #59
0
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The samples a.k.a. observations.

        Returns
        -------
        self
        """
        X = check_array(X, ensure_min_samples=2, estimator=self)
        memory = self.memory
        if isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)

        if self.n_clusters <= 0:
            raise ValueError("n_clusters should be an integer greater than 0."
                             " %s was provided." % str(self.n_clusters))

        if self.linkage == "ward" and self.affinity != "euclidean":
            raise ValueError("%s was provided as affinity. Ward can only "
                             "work with euclidean distances." %
                             (self.affinity, ))

        if self.linkage not in _TREE_BUILDERS:
            raise ValueError("Unknown linkage type %s."
                             "Valid options are %s" %
                             (self.linkage, _TREE_BUILDERS.keys()))
        tree_builder = _TREE_BUILDERS[self.linkage]

        connectivity = self.connectivity
        if self.connectivity is not None:
            if callable(self.connectivity):
                connectivity = self.connectivity(X)
            connectivity = check_array(connectivity,
                                       accept_sparse=['csr', 'coo', 'lil'])

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            compute_full_tree = True
        if compute_full_tree == 'auto':
            # Early stopping is likely to give a speed up only for
            # a large number of clusters. The actual threshold
            # implemented here is heuristic
            compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            n_clusters = None

        # Construct the tree
        kwargs = {}
        if self.linkage != 'ward':
            kwargs['linkage'] = self.linkage
            kwargs['affinity'] = self.affinity
        if self.return_distance:
            self.children_, self.n_components_, self.n_leaves_, parents, \
                self.distances = \
                memory.cache(tree_builder)(X, connectivity,
                                           n_components=self.n_components,
                                           n_clusters=n_clusters,
                                           return_distance=True,
                                           **kwargs)
        else:
            self.children_, self.n_components_, self.n_leaves_, parents = \
                memory.cache(tree_builder)(X, connectivity,
                                           n_components=self.n_components,
                                           n_clusters=n_clusters,
                                           **kwargs)
        # Cut the tree
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters, self.children_,
                                   self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        return self
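For reference, a usage sketch for a fit() like the one above, shown here with scikit-learn's own AgglomerativeClustering (which this snippet closely mirrors); passing a directory path as memory caches the tree construction between identical fits.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0.0, 0.2, size=(30, 2)),
               rng.normal(3.0, 0.2, size=(30, 2))])

model = AgglomerativeClustering(n_clusters=2, linkage='ward',
                                memory='/tmp/agglo_cache')
labels = model.fit(X).labels_
print(np.unique(labels))  # two clusters: [0 1]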