def _do_subject_slice_timing(subject_data, ref_slice=0,
                             slice_order="ascending", interleaved=False,
                             caching=True, write_output_images=2,
                             func_prefix=None, func_basenames=None,
                             ext=None):
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['STC']
    if func_basenames is None:
        func_basenames = [get_basenames(func)
                          for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'), verbose=100)
    runner = lambda handle: mem.cache(handle) if caching else handle
    stc_output = []
    original_bold = subject_data.func
    for sess_func, sess_id in zip(subject_data.func,
                                  range(subject_data.n_sessions)):
        fmristc = runner(fMRISTC(slice_order=slice_order, ref_slice=ref_slice,
                                 interleaved=interleaved, verbose=True).fit)(
                                raw_data=sess_func)
        stc_output.append(runner(fmristc.transform)(
                sess_func,
                output_dir=subject_data.tmp_output_dir if (
                    write_output_images > 0) else None,
                basenames=func_basenames[sess_id],
                prefix=func_prefix, ext=ext))
    subject_data.func = stc_output
    del original_bold, fmristc
    if write_output_images > 1:
        subject_data.hardlink_output_files()
    return subject_data
    def _delete_orientation(self):
        """
        Strip orientation metadata from the functional and anatomical images.

        Garbage orientation metadata can lead to severe mis-registration
        trouble. Every ``delete_orientation`` call goes through a joblib
        cache stored in ``<scratch>/cache_dir``.

        """

        # scratch space falls back to the output directory when unset
        if self.scratch is None:
            self.scratch = self.output_dir
        cache_dir = os.path.join(self.scratch, 'cache_dir')
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        mem = Memory(cachedir=cache_dir, verbose=5)

        # functional images: skipped (with one warning naming the first
        # missing attribute) when the session bookkeeping is incomplete
        missing = next(
            (name for name in ('n_sessions', 'session_output_dirs')
             if getattr(self, name) is None),
            None)
        if missing is not None:
            warnings.warn("'%s' attribute of is None! Skipping" % missing)
        else:
            cleaned = []
            for sess in range(self.n_sessions):
                cleaned.append(mem.cache(delete_orientation)(
                    self.func[sess], self.session_output_dirs[sess]))
            self.func = cleaned

        # anatomical image, when there is one
        if self.anat is None:
            return
        self.anat = mem.cache(delete_orientation)(
            self.anat, self.anat_output_dir)
    def test_multilabel(self):
        """Run the configuration test on a cached synthetic multilabel set."""
        memory = Memory(cachedir=tempfile.gettempdir())
        make_data = memory.cache(
            sklearn.datasets.make_multilabel_classification)
        generator_options = dict(
            n_samples=150,
            n_features=20,
            n_classes=5,
            n_labels=2,
            length=50,
            allow_unlabeled=True,
            sparse=False,
            return_indicator=True,
            return_distributions=False,
            random_state=1,
        )
        X, Y = make_data(**generator_options)

        # Rows 0..99 are used for training and rows 101.. for testing
        # (row 100 ends up in neither split).
        data = {
            'X_train': X[:100, :],
            'Y_train': Y[:100, :],
            'X_test': X[101:, :],
            'Y_test': Y[101:, ],
        }

        pipeline = SimpleClassificationPipeline(
            dataset_properties={'multilabel': True})
        cs = pipeline.get_hyperparameter_search_space()
        self._test_configurations(configurations_space=cs, data=data)
    def fit(self, X, y=None):
        """
        Cluster the landmarks hierarchically and store their labels.

        When ``n_landmarks`` is None every sample of ``X`` acts as a
        landmark. Otherwise ``n_landmarks`` samples are picked, either at
        random (``landmark_strategy == 'random'``) or at a regular stride.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)

        y : ignored

        Returns
        -------
        self
        """

        cache = self.memory
        if isinstance(cache, six.string_types):
            cache = Memory(cachedir=cache, verbose=0)

        # pick the landmark rows (None means "use all of X")
        if self.n_landmarks is None:
            land_indices = None
        elif self.landmark_strategy == 'random':
            rng = check_random_state(self.random_state)
            land_indices = rng.randint(len(X), size=self.n_landmarks)
        else:
            stride = len(X) // self.n_landmarks
            land_indices = np.arange(len(X))[::stride][:self.n_landmarks]

        points = X if land_indices is None else X[land_indices]
        distances = cache.cache(pdist)(points, self.metric)

        tree = cache.cache(linkage)(distances, method=self.linkage)
        # fcluster labels start at 1; shift them so they start at 0
        flat_labels = fcluster(tree, criterion='maxclust', t=self.n_clusters)
        self.landmark_labels_ = flat_labels - 1

        self.landmarks_ = X if land_indices is None else X[land_indices]

        return self
Example #5
0
def fetch_asirra(image_count=1000):
    """Load the Asirra dataset through an on-disk joblib cache.

    The directory returned by ``check_fetch_asirra()`` is used both as the
    cache location and as the first argument of ``_fetch_asirra``.

    Returns a ``Bunch`` with ``images``, ``data`` (the images reshaped to
    one row per image), ``target`` and ``DESCR``.
    """
    dataset_dir = check_fetch_asirra()
    cached_loader = Memory(
        cachedir=dataset_dir, compress=6, verbose=0).cache(_fetch_asirra)
    images, target = cached_loader(dataset_dir, image_count=image_count)
    flattened = images.reshape(len(images), -1)
    return Bunch(data=flattened,
                 images=images,
                 target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #6
0
    def _cache(self, func, memory_level=1, **kwargs):
        """ Return func wrapped by joblib.Memory.cache.

        The memory_level determines the level above which the wrapped
        function output is cached. By specifying a numeric value for
        this level, the user can control the amount of cache memory
        used. This function will cache the function call or not
        depending on the cache level.

        Parameters
        ----------
        func: python function
            The function whose output is to be cached.

        memory_level: integer
            The memory_level from which caching must be enabled for the wrapped
            function.

        kwargs: keyword arguments
            Passed on to Memory.cache.

        Returns
        -------
        The object returned by Memory.cache(func, **kwargs). When
        self.memory_level is lower than memory_level, the Memory used is
        built with cachedir=None; otherwise self.memory is used (a string is
        turned into Memory(cachedir=<string>)).

        Raises
        ------
        TypeError
            If self.memory is neither a string nor a Memory instance once
            caching is requested.
        """

        # Creates attributes if they don't exist
        # This is to make creating them in __init__() optional.
        if not hasattr(self, "memory_level"):
            self.memory_level = 0
        if not hasattr(self, "memory"):
            self.memory = Memory(cachedir=None)

        # If cache level is 0 but a memory object has been provided, set
        # memory_level to 1 with a warning.
        if self.memory_level == 0:
            if (isinstance(self.memory, basestring)
                    or self.memory.cachedir is not None):
                warnings.warn("memory_level is currently set to 0 but "
                              "a Memory object has been provided. "
                              "Setting memory_level to 1.")
                self.memory_level = 1

        if self.memory_level < memory_level:
            mem = Memory(cachedir=None)
            return mem.cache(func, **kwargs)

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)
        if not isinstance(memory, Memory):
            raise TypeError("'memory' argument must be a string or a "
                            "joblib.Memory object.")
        if memory.cachedir is None:
            # __name__ exists on both Python 2 and 3 functions (func_name is
            # Python 2 only); fall back to repr for callables without it.
            func_name = getattr(func, '__name__', repr(func))
            warnings.warn("Caching has been enabled (memory_level = %d) but no"
                          " Memory object or path has been provided (parameter"
                          " memory). Caching deactivated for function %s." %
                          (self.memory_level, func_name))
        return memory.cache(func, **kwargs)
def get_all_metadata(config=None, args=None):
  """Read every metadata table of the dataset and bundle them together.

  Parameters
  ----------
  config : object or None
    Configuration object. When None it is built with ``get_config(args)``.
  args : object or None
    Only used to build ``config`` when ``config`` is None.

  Returns
  -------
  (dict, config)
    The dict holds the tables under the keys ``real_train_annos``,
    ``real_test_annos``, ``train_annos``, ``test_annos``,
    ``validation_annos``, ``class_meta``, ``domain_meta``, ``attrib_meta``
    and ``attrib_meta_with_name``. ``train_annos`` / ``test_annos`` are the
    dev-set splits when ``config.dataset.dev_set.use`` is true, and the
    real annotations otherwise.

  Raises
  ------
  ValueError
    If both ``config`` and ``args`` are None.
  """
  if config is None and args is None:
    raise ValueError('Either config or args need to be not None')
  if config is None:
    config = get_config(args)

  class_meta = read_class_meta(config.dataset.class_meta_file)
  attrib_meta_with_name = read_attribute_meta(config.dataset.attrib_meta_file)
  attrib_meta = attrib_meta_with_name.drop('class_name', axis=1)
  train_annos = read_image_annotations(config.dataset.train_annos_file)
  test_annos = read_image_annotations(config.dataset.test_annos_file,
                                      has_class_id=False)
  domain_meta = read_domain_meta(config.dataset.domain_meta_file)
  # Only the train annotations get a class_name column: the test annotations
  # are read with has_class_id=False.
  train_annos['class_name'] = np.array([class_meta.class_name[class_index] for
                                         class_index in
                                         train_annos.class_index])

  # Prepend the dataset path to each img_path
  train_annos.img_path = train_annos.img_path.apply(
      lambda x: config.dataset.main_path.joinpath(x).abspath())
  test_annos.img_path = test_annos.img_path.apply(
      lambda x: config.dataset.main_path.joinpath(x).abspath())

  # Filter the class meta and train/test annotations to just use the
  # domains defined in config
  class_meta = class_meta[class_meta.domain_index.isin(config.dataset.domains)]
  train_annos = train_annos[train_annos.domain_index.isin(config.dataset.domains)]
  test_annos = test_annos[test_annos.domain_index.isin(config.dataset.domains)]

  # Create dev set
  dev_annos_train, dev_annos_test = create_dev_set(train_annos,
                                                   config)

  # Should we use the dev set as the test set
  if config.dataset.dev_set.use:
    train_used, test_used = dev_annos_train, dev_annos_test
  else:
    train_used, test_used = train_annos, test_annos

  # Optionally run create_flipped_images on the training set, cached on disk
  if config.flip_images:
    memory = Memory(cachedir=config.cache_dir, verbose=config.logging.verbose)
    flip_func = memory.cache(create_flipped_images)
    train_used = flip_func(train_used, config)

  return ({'real_train_annos': train_annos,
           'real_test_annos': test_annos,
           'train_annos': train_used,
           'test_annos': test_used,
           'validation_annos': dev_annos_test,
           'class_meta': class_meta,
           'domain_meta': domain_meta,
           'attrib_meta': attrib_meta,
           'attrib_meta_with_name': attrib_meta_with_name},
          config)
Example #8
0
def cache(func, memory, ref_memory_level=2, memory_level=1, **kwargs):
    """ Return func wrapped by joblib.Memory.cache.

    The memory_level determines the level above which the wrapped
    function output is cached. By specifying a numeric value for
    this level, the user can control the amount of cache memory
    used. This function will cache the function call or not
    depending on the cache level.

    Parameters
    ----------
    func: function
        The function whose output is to be cached.

    memory: instance of joblib.Memory or string
        Used to cache the function call.

    ref_memory_level: int
        The reference memory_level used to determine if function call must
        be cached or not (the given memory is used only when
        ref_memory_level is strictly larger than memory_level)

    memory_level: int
        The memory_level from which caching must be enabled for the wrapped
        function.

    kwargs: keyword arguments
        The keyword arguments passed to memory.cache

    Returns
    -------
    mem: joblib.MemorizedFunc
        object that wraps the function func. When ref_memory_level is not
        larger than memory_level, or memory is None, the wrapper comes from
        Memory(cachedir=None). For consistency, the result of Memory.cache
        is always returned.

    Raises
    ------
    TypeError
        If memory is neither a string nor an instance of memory_classes
        when caching is requested.
    """

    if ref_memory_level <= memory_level or memory is None:
        memory = Memory(cachedir=None)
    else:
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)
        if not isinstance(memory, memory_classes):
            raise TypeError("'memory' argument must be a string or a "
                            "joblib.Memory object. "
                            "%s %s was given." % (memory, type(memory)))
        if memory.cachedir is None:
            # __name__ exists on both Python 2 and 3 functions (func_name is
            # Python 2 only); fall back to repr for callables without it.
            func_name = getattr(func, '__name__', repr(func))
            warnings.warn("Caching has been enabled (memory_level = %d) "
                          "but no Memory object or path has been provided"
                          " (parameter memory). Caching deactivated for "
                          "function %s." %
                          (ref_memory_level, func_name),
                          stacklevel=2)
    return memory.cache(func, **kwargs)
def _do_subject_coregister(
        subject_data, coreg_func_to_anat=True, caching=True,
        ext=None, write_output_images=2, func_basenames=None, func_prefix="",
        anat_basename=None, anat_prefix="", report=True, verbose=True):
    """Coregister a subject's functional and anatomical images.

    A ``Coregister`` estimator is fitted on a (reference, source) pair and
    then applied to the source modality.

    Parameters
    ----------
    subject_data : object
        Must expose ``func``, ``anat``, ``n_sessions``, ``output_dir``,
        ``tmp_output_dir``, ``generate_coregistration_thumbnails()`` and
        ``hardlink_output_files()``.
    coreg_func_to_anat : bool
        If True the anatomical image is the reference and every functional
        session is transformed; otherwise the first functional session is
        the reference and the anatomical image is transformed.
    caching : bool
        If True, ``fit`` / ``transform`` calls are wrapped with a joblib
        ``Memory`` rooted at ``<subject_data.output_dir>/cache_dir``.
    ext :
        Passed to ``transform`` for the anatomical image only.
    write_output_images : int
        If equal to 2, ``subject_data.tmp_output_dir`` is passed to
        ``transform`` as ``output_dir`` (``None`` otherwise). If > 1,
        ``subject_data.hardlink_output_files()`` is called at the end.
    func_basenames, func_prefix :
        Basenames (one per session, default ``get_basenames`` of each
        session) and prefix used when transforming the functional images.
    anat_basename, anat_prefix :
        Basename (default ``get_basenames(subject_data.anat)``) and prefix
        used when transforming the anatomical image.
    report : bool
        If True, coregistration thumbnails are generated.
    verbose : bool
        Forwarded to the ``Coregister`` constructor.

    Returns
    -------
    subject_data
        The same object that was passed in, updated in place.
    """
    ref = subject_data.func[0]
    src = subject_data.anat
    if coreg_func_to_anat:
        ref, src = src, ref

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'), verbose=100)

    def runner(handle):
        # wrap the callable with the cache only when caching was requested
        return mem.cache(handle) if caching else handle

    # estimate realignment (affine) params for coreg
    coreg = runner(Coregister(verbose=verbose).fit)(ref, src)

    # apply coreg
    if coreg_func_to_anat:
        if func_basenames is None:
            func_basenames = [get_basenames(func)
                              for func in subject_data.func]
        coreg_func = []
        for sess_func, sess_id in zip(subject_data.func, range(
                subject_data.n_sessions)):
            coreg_func.append(runner(coreg.transform)(
                sess_func, output_dir=subject_data.tmp_output_dir if (
                    write_output_images == 2) else None,
                basenames=func_basenames[sess_id], prefix=func_prefix))
        subject_data.func = coreg_func
        # NOTE(review): the loaded volume is not used later in this function;
        # the call is kept in case load_vols is relied on to fail early on an
        # unreadable output -- confirm and drop if not needed.
        src = load_vols(subject_data.func[0])[0]
    else:
        if anat_basename is None:
            anat_basename = get_basenames(subject_data.anat)
        subject_data.anat = runner(coreg.transform)(
            subject_data.anat, basename=anat_basename,
            output_dir=subject_data.tmp_output_dir if (
                write_output_images == 2) else None, prefix=anat_prefix,
            ext=ext)

    # generate coregistration QA thumbs
    if report:
        subject_data.generate_coregistration_thumbnails(
            coreg_func_to_anat=coreg_func_to_anat, nipype=False)

    del coreg
    if write_output_images > 1:
        subject_data.hardlink_output_files()
    return subject_data
    def fit(self, niimgs, y=None):
        """Compute the mask corresponding to the data

        Parameters
        ----------
        niimgs: list of filenames or NiImages
            Data on which the mask must be calculated. If this is a list,
            the affine is considered the same for all.

        y: ignored

        Returns
        -------
        self, with the (resampled) mask stored in ``mask_img_``.
        """

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        # Load data (if filenames are given, load them)
        # NOTE: print is called with a single pre-formatted string so the
        # output is the same under Python 2 and Python 3.
        if self.verbose > 0:
            print("[%s.fit] Loading data from %s" % (
                self.__class__.__name__,
                utils._repr_niimgs(niimgs)[:200]))
        # Note that data is not loaded into memory at this stage
        # if niimg is a string
        data = [utils.check_niimgs(niimg, accept_3d=True)
                for niimg in niimgs]

        # Compute the mask if not given by the user
        if self.mask is None:
            if self.verbose > 0:
                print("[%s.fit] Computing the mask" % self.__class__.__name__)
            mask = memory.cache(masking.compute_multi_epi_mask,
                                ignore=['verbose'])(
                                    niimgs,
                                    connected=self.mask_connected,
                                    opening=self.mask_opening,
                                    lower_cutoff=self.mask_lower_cutoff,
                                    upper_cutoff=self.mask_upper_cutoff,
                                    n_jobs=self.n_jobs,
                                    verbose=(self.verbose - 1))
            # builtin int is what the removed np.int alias pointed to;
            # the affine is taken from the first checked image
            self.mask_img_ = Nifti1Image(mask.astype(int),
                                         data[0].get_affine())
        else:
            self.mask_img_ = utils.check_niimg(self.mask)

        # The mask is always passed through resample_img with the configured
        # target_affine / target_shape (this lets the user change the affine,
        # the shape or both).
        if self.verbose > 0:
            print("[%s.transform] Resampling mask" % self.__class__.__name__)
        self.mask_img_ = memory.cache(resampling.resample_img)(
            self.mask_img_,
            target_affine=self.target_affine,
            target_shape=self.target_shape,
            copy=(self.target_affine is not None and
                  self.target_shape is not None))

        return self
 def get_multilabel(self):
     """Return a synthetic multilabel dataset, cached in the temp directory."""
     memory = Memory(cachedir=tempfile.gettempdir())
     make_cached = memory.cache(make_multilabel_classification)
     options = dict(
         n_samples=100,
         n_features=10,
         n_classes=5,
         n_labels=5,
         return_indicator=True,
         random_state=1,
     )
     return make_cached(**options)
Example #12
0
def _check_memory(memory, verbose=0):
    """Function to ensure an instance of a joblib.Memory object.

    Parameters
    ----------
    memory: None or instance of joblib.Memory or str
        Used to cache the masking process.
        If a str is given, it is the path to the caching directory.

    verbose : int, optional (default 0)
        Verbosity level.

    Returns
    -------
    instance of joblib.Memory.
        A new Memory when ``memory`` is None (cachedir=None) or a string
        (cachedir=<that path, user-expanded when
        nilearn.EXPAND_PATH_WILDCARDS is set>); any other value is returned
        unchanged.

    Raises
    ------
    ValueError
        If ``memory`` is a string whose parent directory is non-empty and
        does not exist.
    """
    if memory is None:
        memory = Memory(cachedir=None, verbose=verbose)
    if isinstance(memory, _basestring):
        cache_dir = memory
        if nilearn.EXPAND_PATH_WILDCARDS:
            cache_dir = os.path.expanduser(cache_dir)

        # Perform some verifications on given path.
        # os.path.split always returns a (head, tail) pair, so only the head
        # (the parent directory) needs to be checked.
        parent_dir = os.path.split(cache_dir)[0]
        if parent_dir != '' and not os.path.exists(parent_dir):
            if (not nilearn.EXPAND_PATH_WILDCARDS and
                    cache_dir.startswith("~")):
                # Maybe the user want to enable expanded user path.
                error_msg = ("Given cache path parent directory doesn't "
                             "exists, you gave '{0}'. Enabling "
                             "nilearn.EXPAND_PATH_WILDCARDS could solve "
                             "this issue.".format(parent_dir))
            elif memory.startswith("~"):
                # Path built on top of expanded user path doesn't exist.
                # {0} is what the user gave, {1} is its expanded form.
                error_msg = ("Given cache path parent directory doesn't "
                             "exists, you gave '{0}' which was expanded "
                             "as '{1}' but doesn't exist either. Use "
                             "nilearn.EXPAND_PATH_WILDCARDS to deactivate "
                             "auto expand user path (~) behavior."
                             .format(os.path.dirname(memory),
                                     parent_dir))
            else:
                # The given cache base path doesn't exist.
                error_msg = ("Given cache path parent directory doesn't "
                             "exists, you gave '{0}'."
                             .format(parent_dir))
            raise ValueError(error_msg)

        memory = Memory(cachedir=cache_dir, verbose=verbose)
    return memory
Example #13
0
def cache(self, func, func_memory_level, **kwargs):
    """ Return a joblib.Memory object if necessary (depends on memory_level)

    The memory_level is a rough estimator of the amount of memory necessary
    to cache a function call. By specifying a numeric value for this level,
    the user will be able to control more or less the memory used on his
    computer. This function will cache the function call or not depending
    on the memory level. This is a helper to avoid code pasting.

    Parameters
    ----------

    self: python object
        The object containing information about caching. It must have an
        integer memory_level attribute to determine if the function must be
        cached or not. Its memory attribute (a joblib.Memory or a path
        string) is used if caching is necessary; a missing or None memory
        is treated as "no cache available".

    func: python function
        The function that may be cached

    func_memory_level: integer
        The memory_level from which caching must be enabled.

    kwargs: keyword arguments
        Passed on to memory.cache.

    Returns
    -------

    Either the original function (if there is no need to cache it, or no
    memory is available) or the result of memory.cache(func, **kwargs).
    """
    memory = getattr(self, 'memory', None)

    # if memory level is 0 but a memory object is provided, put memory_level
    # to 1 with a warning
    if self.memory_level == 0:
        if memory is not None and (isinstance(memory, basestring)
                                   or memory.cachedir is not None):
            warnings.warn("memory_level is set to 0 but a Memory object has"
                    " been provided. Setting memory_level to 1.")
            self.memory_level = 1
    if self.memory_level < func_memory_level:
        return func

    # __name__ exists on both Python 2 and 3 functions (func_name is
    # Python 2 only); fall back to repr for callables without it.
    func_name = getattr(func, '__name__', repr(func))
    no_cache_msg = ("Caching has been enabled (memory_level = %d) but no"
                    " Memory object or path has been provided (parameter"
                    " memory). Caching canceled for function %s." %
                    (self.memory_level, func_name))
    if memory is None:
        # nothing to cache with: warn and hand back the bare function
        warnings.warn(no_cache_msg)
        return func
    if isinstance(memory, basestring):
        memory = Memory(cachedir=memory)
    if memory.cachedir is None:
        warnings.warn(no_cache_msg)
    return memory.cache(func, **kwargs)
Example #14
0
    def _fit(self, X, y=None, **fit_params):
        """Fit every step but the last one, passing the data down the chain.

        Keyword arguments named ``<step>__<param>`` are routed to the
        matching step. Each intermediate step is fitted through
        ``_fit_transform_one`` (when it has ``transform`` or
        ``fit_transform``) or ``_fit_sample_one`` (when it has ``sample``),
        both wrapped by the pipeline's joblib cache.

        Returns ``(Xt, yt, fit_params)`` where ``fit_params`` are the
        parameters routed to the final step ({} when the final estimator is
        None).
        """
        self._validate_steps()

        # Resolve self.memory into a joblib.Memory instance; None and a path
        # string both end up as the cachedir argument.
        memory = self.memory
        if memory is None or isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        elif not isinstance(memory, Memory):
            raise ValueError("'memory' should either be a string or"
                             " a joblib.Memory instance, got"
                             " 'memory={!r}' instead.".format(memory))

        cached_fit_transform = memory.cache(_fit_transform_one)
        cached_fit_sample = memory.cache(_fit_sample_one)

        # one parameter dict per non-None step, filled from "<step>__<param>"
        fit_params_steps = {name: {} for name, step in self.steps
                            if step is not None}
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval

        Xt, yt = X, y
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                continue

            # we do not clone when caching is disabled to preserve
            # backward compatibility
            if memory.cachedir is None:
                candidate = transformer
            else:
                candidate = clone(transformer)

            # Fit or load from cache the current transformer
            is_transformer = (hasattr(candidate, "transform") or
                              hasattr(candidate, "fit_transform"))
            if is_transformer:
                Xt, fitted = cached_fit_transform(
                    candidate, None, Xt, yt,
                    **fit_params_steps[name])
            elif hasattr(candidate, "sample"):
                Xt, yt, fitted = cached_fit_sample(
                    candidate, Xt, yt,
                    **fit_params_steps[name])

            # Store the fitted object back into the step list. This is
            # necessary when the transformer was loaded from the cache.
            self.steps[step_idx] = (name, fitted)

        if self._final_estimator is None:
            return Xt, yt, {}
        final_name = self.steps[-1][0]
        return Xt, yt, fit_params_steps[final_name]
Example #15
0
    def _niigz2nii(self):
        """
        Convert .nii.gz to .nii (crucial for SPM).

        ``do_niigz2nii`` is applied, through a joblib cache stored in
        ``<scratch>/cache_dir``, to every functional session (when ``func``,
        ``n_sessions`` and ``session_output_dirs`` are all set) and to the
        anatomical image (when set).

        """
        # same fallback as in _delete_orientation: without it os.path.join
        # fails when no scratch directory has been set
        if self.scratch is None:
            self.scratch = self.output_dir
        cache_dir = os.path.join(self.scratch, 'cache_dir')
        mem = Memory(cache_dir, verbose=100)
        self._sanitize_session_output_dirs()

        # identity checks: `None in [...]` would compare with ==, which is
        # ambiguous for array-like members
        required = (self.func, self.n_sessions, self.session_output_dirs)
        if all(item is not None for item in required):
            self.func = [mem.cache(do_niigz2nii)(
                self.func[sess], output_dir=self.session_output_dirs[sess])
                         for sess in range(self.n_sessions)]
        if self.anat is not None:
            self.anat = mem.cache(do_niigz2nii)(
                self.anat, output_dir=self.anat_output_dir)
Example #16
0
    def fit(self, niimgs, y=None):
        """Compute the mask corresponding to the data

        Parameters
        ----------
        niimgs: list of filenames or NiImages
            Data on which the mask must be calculated. If this is a list,
            the affine is considered the same for all.

        y: ignored

        Returns
        -------
        self, with the (resampled) mask stored in ``mask_``.
        """

        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        # Load data (if filenames are given, load them)
        # NOTE: print is called with a single pre-formatted string so the
        # output is the same under Python 2 and Python 3.
        if self.verbose > 0:
            print("[%s.fit] Loading data" % self.__class__.__name__)
        niimgs = utils.check_niimgs(niimgs, accept_3d=True)

        # Compute the mask if not given by the user
        if self.mask is None:
            if self.verbose > 0:
                print("[%s.fit] Computing the mask" % self.__class__.__name__)
            mask = memory.cache(masking.compute_epi_mask)(
                niimgs.get_data(),
                connected=self.mask_connected,
                opening=self.mask_opening,
                lower_cutoff=self.mask_lower_cutoff,
                upper_cutoff=self.mask_upper_cutoff,
                verbose=(self.verbose - 1),
            )
            # builtin int is what the removed np.int alias pointed to
            self.mask_ = Nifti1Image(mask.astype(int), niimgs.get_affine())
        else:
            self.mask_ = utils.check_niimg(self.mask)

        # The mask is always passed through resample_img with the configured
        # target_affine / target_shape (this lets the user change the affine,
        # the shape or both).
        if self.verbose > 0:
            print("[%s.transform] Resampling mask" % self.__class__.__name__)
        self.mask_ = memory.cache(resampling.resample_img)(
            self.mask_,
            target_affine=self.target_affine,
            target_shape=self.target_shape,
            copy=(self.target_affine is not None and self.target_shape is not None),
        )

        return self
Example #17
0
    def fit(self, X, y=None, get_rhos=False):
        '''
        Prepare for divergence estimation "from" new data "to" X.

        Builds the search indices for the bags of X (through the joblib
        cache) and, on request, the within-bag distances.

        Parameters
        ----------
        X : list of arrays or :class:`skl_groups.features.Features`
            The bags to search "to".

        get_rhos : boolean, optional, default False
            Compute within-bag distances :attr:`rhos_`. These are only needed
            for some divergence functions or if do_sym is passed, and they'll
            be computed (and saved) during :meth:`transform` if they're not
            computed here. When False, any existing :attr:`rhos_` is removed.

            If you're using Jensen-Shannon divergence, a higher max_K may
            be needed once it sees the number of points in the transformed
            bags, so the computation here might be wasted.

        Raises
        ------
        ValueError
            If the largest K needed is not smaller than the smallest bag.
        '''
        X = as_features(X, stack=True, bare=True)
        self.features_ = X

        # if we're using a function that needs to pick its K vals itself,
        # then we need to set max_K here. when we transform(), might have to
        # re-do this :|
        Ks = self._get_Ks()
        chosen = _choose_funcs(
            self.div_funcs, Ks, X.dim, X.n_pts, None, self.version)
        _, _, _, max_K, save_all_Ks, _ = chosen

        smallest_bag = X.n_pts.min()
        if max_K >= smallest_bag:
            msg = "asked for K = {}, but there's a bag with only {} points"
            raise ValueError(msg.format(max_K, smallest_bag))

        cache = self.memory
        if isinstance(cache, string_types):
            cache = Memory(cachedir=cache, verbose=0)

        indices = cache.cache(_build_indices)(X, self._flann_args())
        self.indices_ = indices

        if get_rhos:
            self.rhos_ = _get_rhos(
                X, indices, Ks, max_K, save_all_Ks, self.min_dist)
        elif hasattr(self, 'rhos_'):
            del self.rhos_
        return self
Example #18
0
    def transform(self, X):
        r'''
        Estimate the divergences from the bags of X to :attr:`features_`.

        Parameters
        ----------
        X : list of bag feature arrays or :class:`skl_groups.features.Features`
            The bags to search "from".

        Returns
        -------
        divs : array of shape ``[len(div_funcs), len(Ks), len(X), len(features_)] + ([2] if do_sym else [])``
            The divergences from X to :attr:`features_`.
            ``divs[d, k, i, j]`` is the ``div_funcs[d]`` divergence
            from ``X[i]`` to ``features_[j]`` using a K of ``Ks[k]``.
            If ``do_sym``, ``divs[d, k, i, j, 0]`` is
            :math:`D_{d,k}( X_i \| \texttt{features_}_j)` and
            ``divs[d, k, i, j, 1]`` is :math:`D_{d,k}(\texttt{features_}_j \| X_i)`.

        Raises
        ------
        ValueError
            If the dimension of X differs from the one seen in fit.
        '''
        query = as_features(X, stack=True, bare=True)
        fitted = self.features_

        Ks = np.asarray(self.Ks)

        if query.dim != fitted.dim:
            msg = "incompatible dimensions: fit with {}, transform with {}"
            raise ValueError(msg.format(fitted.dim, query.dim))

        cache = self.memory
        if isinstance(cache, string_types):
            cache = Memory(cachedir=cache, verbose=0)

        # Y_indices is left out of the cache key to avoid slow pickling.
        # NOTE: if the indices are approximate, then might not get the same
        #       results!
        cached_est = cache.cache(
            _est_divs, ignore=['n_jobs', 'Y_indices', 'Y_rhos'])
        previous_rhos = getattr(self, 'rhos_', None)
        divs, self.rhos_ = cached_est(
            query, fitted, self.indices_, previous_rhos,
            self.div_funcs, Ks,
            self.do_sym, self.clamp, self.version, self.min_dist,
            self._flann_args(), self._n_jobs)
        return divs
def _do_subject_realign(subject_data, reslice=True, register_to_mean=False,
                        caching=True, hardlink_output=True, ext=None,
                        func_basenames=None, write_output_images=2,
                        report=True, func_prefix=None):
    if register_to_mean:
        raise NotImplementedError("Feature pending...")
    if func_prefix is None:
        func_prefix = PREPROC_OUTPUT_IMAGE_PREFICES['MC']
    if func_basenames is None:
        func_basenames = [get_basenames(func)
                          for func in subject_data.func]

    # prepare for smart caching
    if caching:
        mem = Memory(cachedir=os.path.join(
            subject_data.output_dir, 'cache_dir'), verbose=100)
    runner = lambda handle: mem.cache(handle) if caching else handle
    mrimc = runner(MRIMotionCorrection(
            n_sessions=subject_data.n_sessions, verbose=True).fit)(
        [sess_func for sess_func in subject_data.func])
    mrimc_output = runner(mrimc.transform)(
        reslice=reslice,
        output_dir=subject_data.tmp_output_dir if (
            write_output_images == 2) else None, ext=ext,
        prefix=func_prefix, basenames=func_basenames)
    subject_data.func = mrimc_output['realigned_images']
    subject_data.realignment_parameters = mrimc_output[
        'realignment_parameters']

    # generate realignment thumbs
    if report:
        subject_data.generate_realignment_thumbnails(nipype=False)

    # garbage collection
    del mrimc

    if write_output_images > 1:
        subject_data.hardlink_output_files()

    return subject_data
def _do_subject_smooth(subject_data, fwhm, prefix=None,
                       write_output_images=2, func_basenames=None,
                       concat=False, caching=True):
    if prefix is None:
        prefix = PREPROC_OUTPUT_IMAGE_PREFICES['smoothing']
    if func_basenames is None:
        func_basenames = [get_basenames(func) for func in subject_data.func]
    if caching:
        mem = Memory(cachedir=os.path.join(
                subject_data.output_dir, 'cache_dir'), verbose=100)
    sfunc = []
    for sess in range(subject_data.n_sessions):
        sess_func = subject_data.func[sess]
        _tmp = mem.cache(smooth_image)(sess_func,
                                   fwhm)
        if write_output_images == 2:
            _tmp = mem.cache(save_vols)(
                _tmp, subject_data.output_dir, basenames=func_basenames[sess],
                prefix=prefix, concat=concat)
        sfunc.append(_tmp)
    subject_data.func = sfunc
    return subject_data
Example #21
0
def fetch_asirra(image_count=1000):
    """Load the Asirra cats and dogs dataset through a joblib cache.

    The location returned by ``check_fetch_asirra`` is used both as the
    cache directory and as the first argument of the cached loader.

    Parameters
    ----------
    image_count : positive integer
        Forwarded to the loader as ``image_count``.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    dataset_location = check_fetch_asirra()
    cache = Memory(cachedir=dataset_location, compress=6, verbose=0)
    cached_loader = cache.cache(_fetch_asirra)
    images, target = cached_loader(dataset_location, image_count=image_count)

    # one row per image, all remaining axes collapsed
    flattened = images.reshape(len(images), -1)
    return Bunch(data=flattened,
                 images=images,
                 target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #22
0
    def fit(self, data, Y=None):
        """Learn the maps from ``data`` and store them in ``self.maps_``.

        A per-subject PCA is run in parallel, the results are concatenated
        and then either decomposed (randomized SVD followed by fastica) or
        passed to ``self._find_high_kurtosis``, depending on whether
        ``self.kurtosis_thr`` is ``None``.

        Parameters
        ----------
        data : array or list
            Per-subject data; a private copy is made before processing.
        Y : ignored
            Not used by this method.

        Returns
        -------
        self
        """
        # Arrays expose ``copy``; anything else (probably a list) is
        # deep-copied instead.
        data = data.copy() if hasattr(data, 'copy') else copy.deepcopy(data)

        # A string is taken as a cache directory
        memory = self.memory
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory)

        run_parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        subject_pcas = run_parallel(
            delayed(subject_pca)(subject_data,
                                 n_components=self.n_components, mem=memory)
            for subject_data in data)
        pcas = np.concatenate(subject_pcas, axis=1)
        del subject_pcas

        if self.kurtosis_thr is not None:
            ica_maps = self._find_high_kurtosis(pcas, memory)
        else:
            svd_output = memory.cache(randomized_svd)(
                pcas, self.n_components)
            group_maps = svd_output[0][:, :self.n_components]
            ica_output = memory.cache(fastica)(
                group_maps, whiten=False, fun='cube',
                random_state=self.random_state)
            ica_maps = ica_output[2].T

        del pcas
        self.maps_ = ica_maps

        if not self.maps_only:
            # Relearn the time series
            self.learn_from_maps(data)

        return self
Example #23
0
 def __init__(self, mask_img=None, smoothing_fwhm=None,
              memory=Memory(None), memory_level=1, verbose=0,
              n_jobs=1, minimize_memory=True):
     """Store the model parameters and reset the fitted state.

     ``memory`` given as a string is wrapped in a ``Memory`` object; any
     other value is stored unchanged.
     """
     # masking / smoothing parameters
     self.mask_img = mask_img
     self.smoothing_fwhm = smoothing_fwhm

     # caching
     self.memory = (
         Memory(memory) if isinstance(memory, _basestring) else memory)
     self.memory_level = memory_level

     # execution options
     self.verbose = verbose
     self.n_jobs = n_jobs
     self.minimize_memory = minimize_memory

     # filled in by fit
     self.second_level_input_ = None
     self.confounds_ = None
Example #24
0
 def __init__(self, t_r=None, slice_time_ref=0., hrf_model='glover',
              drift_model='cosine', period_cut=128, drift_order=1,
              fir_delays=[0], min_onset=-24, mask_img=None, target_affine=None,
              target_shape=None, smoothing_fwhm=None, memory=Memory(None),
              memory_level=1, standardize=False, signal_scaling=0,
              noise_model='ar1', verbose=0, n_jobs=1,
              minimize_memory=True, subject_label=None):
     """Store the model parameters and reset the fitted state.

     ``memory`` given as a string is wrapped in a ``Memory`` object.
     ``signal_scaling`` must be ``False``, ``0``, ``1`` or ``(0, 1)``; for
     the latter three the value is kept as ``scaling_axis``,
     ``signal_scaling`` is set to ``True`` and ``standardize`` is forced to
     ``False``.

     Raises
     ------
     ValueError
         If ``signal_scaling`` is none of the accepted values.
     """
     # design matrix parameters
     self.t_r = t_r
     self.slice_time_ref = slice_time_ref
     self.hrf_model = hrf_model
     self.drift_model = drift_model
     self.period_cut = period_cut
     self.drift_order = drift_order
     self.fir_delays = fir_delays
     self.min_onset = min_onset

     # glm parameters
     self.mask_img = mask_img
     self.target_affine = target_affine
     self.target_shape = target_shape
     self.smoothing_fwhm = smoothing_fwhm
     self.memory = (
         Memory(memory) if isinstance(memory, _basestring) else memory)
     self.memory_level = memory_level
     self.standardize = standardize
     if signal_scaling is not False:
         if signal_scaling not in [0, 1, (0, 1)]:
             raise ValueError('signal_scaling must be "False", "0", "1"'
                              ' or "(0, 1)"')
         # scaling requested: remember the axis and disable standardization
         self.scaling_axis = signal_scaling
         self.signal_scaling = True
         self.standardize = False
     else:
         self.signal_scaling = signal_scaling
     self.noise_model = noise_model
     self.verbose = verbose
     self.n_jobs = n_jobs
     self.minimize_memory = minimize_memory

     # attributes
     self.labels_ = None
     self.results_ = None
     self.subject_label = subject_label
Example #25
0

@pytest.fixture
def app_notest():
    """Return a test client of the app with the ``TESTING`` flag off.

    The client gets three extra attributes, ``post_check``, ``get_check``
    and ``delete_check``, each being the matching client method passed
    through ``app_call_wrapper``.
    """
    tapp = fd_app(cache_dir)
    tapp.config['TESTING'] = False

    client = tapp.test_client()
    for verb in ('post', 'get', 'delete'):
        wrapped = app_call_wrapper(getattr(client, verb))
        setattr(client, verb + '_check', wrapped)
    return client


# Module-level joblib Memory. Its directory is keyed by the current process
# id, so every process gets its own sub-directory under
# ``<cache_dir>/_joblib_cache``.
memory = Memory(cachedir=os.path.join(cache_dir, '_joblib_cache',
                                      str(os.getpid())),
                verbose=0)

#=============================================================================#
#
#                     Feature extraction
#
#=============================================================================#


def get_features(app, hashed=True, metadata_fields='data_dir'):
    method = V01 + "/feature-extraction/"
    pars = {"use_hashing": hashed}
    if metadata_fields == 'data_dir':
        pars["data_dir"] = data_dir
    elif metadata_fields == 'dataset_definition':
    plt.title("%s / precision" % title)


# Fetching datasets ###########################################################
print("-- Fetching datasets ...")
from nilearn import datasets
# MSDL atlas (its ``maps`` are handed to the masker below) and a single
# ADHD subject
msdl_atlas_dataset = datasets.fetch_msdl_atlas()
adhd_dataset = datasets.fetch_adhd(n_subjects=1)


# Extracting region signals ###################################################
import nilearn.image
import nilearn.input_data

from sklearn.externals.joblib import Memory
# One Memory object is shared by the masker and by the explicit
# ``mem.cache`` call further down.
mem = Memory('nilearn_cache')

# Masker built on the atlas maps, with resampling_target="maps"
masker = nilearn.input_data.NiftiMapsMasker(
    msdl_atlas_dataset.maps, resampling_target="maps", detrend=True,
    low_pass=None, high_pass=0.01, t_r=2.5, standardize=True,
    memory=mem, memory_level=1, verbose=2)
masker.fit()

# First functional run and its confounds file
fmri_filename = adhd_dataset.func[0]
confound_filename = adhd_dataset.confounds[0]

# Computing some confounds (the call goes through the joblib cache)
hv_confounds = mem.cache(nilearn.image.high_variance_confounds)(
    fmri_filename)

time_series = masker.transform(fmri_filename,
Example #27
0
Synopsis: Demo for coregistration in pure python

It demos coregistration on a variety of datasets including:
SPM single-subject auditory, NYU rest, ABIDE, etc.
"""

import os
import glob
import matplotlib.pyplot as plt
from pypreprocess.datasets import fetch_spm_auditory, fetch_nyu_rest
from pypreprocess.reporting.check_preprocessing import plot_registration
from pypreprocess.coreg import Coregister
from sklearn.externals.joblib import Memory

# joblib Memory object pointed at the "demos_cache" directory
mem = Memory("demos_cache")


def _run_demo(func, anat):
    """Fit a ``Coregister`` on ``(anat, func)``, apply it and show QA plots.

    Parameters
    ----------
    func : object
        Functional data; passed to ``fit`` as second argument and to
        ``transform``.
    anat : object
        Anatomical data; passed to ``fit`` as first argument.
    """
    # fit, then apply the estimated coregistration to the functional data
    coregistered = Coregister().fit(anat, func).transform(func)

    # QA
    # NOTE(review): the "before coreg" plot is drawn with the already
    # transformed image -- confirm this is intended.
    qa_plots = (
        (anat, coregistered, "before coreg"),
        (coregistered, anat, "after coreg"),
    )
    for first_img, second_img, plot_title in qa_plots:
        plot_registration(first_img, second_img, title=plot_title)
    plt.show()

Example #28
0
    (redirects_url, redirects_filename),
    (page_links_url, page_links_filename),
]

for url, filename in resources:
    if not os.path.exists(filename):
        import urllib
        print "Downloading data from '%s', please wait..." % url
        opener = urllib.urlopen(url)
        open(filename, 'wb').write(opener.read())
        print

###############################################################################
# Loading the redirect files

# joblib Memory object using the current working directory as cache dir
memory = Memory(cachedir=".")


def index(redirects, index_map, k):
    """Return the integer index of article ``k`` after redirect resolution.

    ``k`` is first replaced by its redirect target when ``redirects`` has
    one. Names not yet present in ``index_map`` are registered with the
    next free index (the current size of ``index_map``).
    """
    resolved = redirects.get(k, k)
    if resolved not in index_map:
        index_map[resolved] = len(index_map)
    return index_map[resolved]


# Length of the URI prefix shared by the DBpedia resources
DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
# Skips the leading "<" and the prefix, and drops the trailing ">"
SHORTNAME_SLICE = slice(1 + DBPEDIA_RESOURCE_PREFIX_LEN, -1)


def short_name(nt_uri):
    """Strip the < and > URI markers and the common URI prefix."""
    first, last = SHORTNAME_SLICE.start, SHORTNAME_SLICE.stop
    return nt_uri[first:last]
def boo(subject_idx=0,
        cut_coords=None,
        n_components=20,
        n_clusters=2000,
        memory='nilearn_cache'):
    """Parcellate one Haxby subject with Ward clustering on t-test maps.

    The function masks and normalizes the subject's fMRI data, computes a
    voxel-wise t-test for every pair of conditions, clusters the resulting
    (normalized, -log10) p-value maps with a spatially constrained Ward
    agglomeration, writes the labels to ``parcellation.nii`` and plots them.

    Parameters
    ----------
    subject_idx : int
        Index of the subject whose anatomical and functional images are
        used. NOTE(review): labels are always read from
        ``session_target[0]`` -- confirm this is intended for
        ``subject_idx > 0``.
    cut_coords : object or None
        Forwarded to ``plot_roi`` for the EPI mask plot.
    n_components : int
        Accepted but not used by this function.
    n_clusters : int
        Number of clusters for the Ward agglomeration.
    memory : str
        Cache directory, used both for the joblib ``Memory`` wrapping the
        intermediate steps and for ``WardAgglomeration``.
    """
    # Honor the ``memory`` argument (the path used to be hard-coded to
    # 'nilearn_cache', which is also the default).
    mem = Memory(cachedir=memory)

    # ## Load the data ###################################################

    print("Fetch the data files from Internet")
    haxby_dataset = datasets.fetch_haxby(n_subjects=subject_idx + 1)

    print("Second, load the labels")
    # NOTE(review): ``basestring`` only exists on Python 2.
    haxby_labels = np.genfromtxt(haxby_dataset.session_target[0],
                                 skip_header=1,
                                 usecols=[0],
                                 dtype=basestring)

    # ## Find voxels of interest ##############################################

    print("Load the data.")
    anat_filename = haxby_dataset.anat[subject_idx]
    anat_img = nibabel.load(anat_filename)
    fmri_filename = haxby_dataset.func[subject_idx]
    fmri_raw_img = nibabel.load(fmri_filename)

    print("Build a mask based on the activations.")
    epi_masker = NiftiMasker(mask_strategy='epi',
                             detrend=True,
                             standardize=True)
    epi_masker = mem.cache(epi_masker.fit)(fmri_raw_img)
    plot_roi(epi_masker.mask_img_,
             bg_img=anat_img,
             title='EPI mask',
             cut_coords=cut_coords)

    print(
        "Normalize the (transformed) data")  # zscore per pixel, over examples.
    fmri_masked_vectors = epi_masker.transform(fmri_raw_img)
    fmri_normed_vectors = mem.cache(stats.mstats.zscore)(fmri_masked_vectors,
                                                         axis=0)
    fmri_normed_img = epi_masker.inverse_transform(fmri_normed_vectors)

    print("Smooth the (spatial) data.")
    fmri_smooth_img = mem.cache(image.smooth_img)(fmri_normed_img, fwhm=1)

    print("Mask the MRI data.")
    masked_fmri_vectors = mem.cache(epi_masker.transform)(fmri_smooth_img)
    fmri_masked_img = epi_masker.inverse_transform(masked_fmri_vectors)

    # ## Compute mean values based on condition matrix ##########################################
    condition_names = list(np.unique(haxby_labels))
    n_conditions = len(condition_names)
    n_good_voxels = masked_fmri_vectors.shape[1]

    mean_vectors = np.empty((n_conditions, n_good_voxels))
    for ci, condition in enumerate(condition_names):
        condition_vectors = masked_fmri_vectors[haxby_labels == condition, :]
        mean_vectors[ci, :] = condition_vectors.mean(axis=0)

    # ## Use similarity across conditions as the 4th dimension ##########################################
    n_conds = len(condition_names)
    # Floor division keeps the count an integer under true division too; a
    # float is not a valid array dimension for np.zeros.
    n_compares = n_conds * (n_conds - 1) // 2

    p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1]))
    comparison_text = []
    comparison_img = []
    idx = 0
    for i, cond in enumerate(condition_names):
        for j, cond2 in enumerate(condition_names[i + 1:]):
            print("Computing ttest for %s vs. %s." % (cond, cond2))
            _, p_vector = stats.ttest_ind(
                masked_fmri_vectors[haxby_labels == cond, :],
                masked_fmri_vectors[haxby_labels == cond2, :],
                axis=0)

            p_vector /= p_vector.max()  # normalize
            p_vector = -np.log10(p_vector)
            p_vector[np.isnan(p_vector)] = 0.
            p_vector[p_vector > 10.] = 10.  # cap the -log10 values at 10

            p_img = epi_masker.inverse_transform(p_vector)
            comparison_img.append(p_img)
            comparison_text.append('%s vs. %s' % (cond, cond2))
            p_vectors[idx, :] = p_vector
            idx += 1

    #n_comparisons = n_conditions * (n_conditions-1) / 2
    #similarity_vectors = np.empty((n_good_voxels, n_comparisons))
    #for vi in np.arange(n_good_voxels):
    #    similarity_vectors[vi, :] = pdist(mean_vectors[:, vi])

    # Compute a connectivity matrix (for constraining the clustering).
    # The builtin ``bool`` replaces the ``np.bool`` alias, which NumPy
    # deprecated and later removed.
    mask_data = epi_masker.mask_img_.get_data().astype(bool)
    connectivity = sk_image.grid_to_graph(n_x=mask_data.shape[0],
                                          n_y=mask_data.shape[1],
                                          n_z=mask_data.shape[2],
                                          mask=mask_data)

    # Cluster (#2)

    start = time.time()
    ward = WardAgglomeration(n_clusters=n_clusters,
                             connectivity=connectivity,
                             memory=memory)
    ward.fit(p_vectors)

    print("Ward agglomeration %d clusters: %.2fs" %
          (n_clusters, time.time() - start))

    # Compute an image with one ROI per label, and save to disk
    labels = ward.labels_ + 1  # Avoid 0 label - 0 means mask.
    labels_img = epi_masker.inverse_transform(labels)
    labels_img.to_filename('parcellation.nii')

    # Plot image with len(labels) ROIs, and store
    #   the cut coordinates to reuse for all plots
    #   and the figure for plotting all to a common axis
    first_plot = plot_roi(labels_img,
                          title="Ward parcellation",
                          bg_img=anat_img)
    plt.show()
Example #30
0
    def __init__(self,
                 n_components=20, n_epochs=1, alpha=0., dict_init=None,
                 random_state=None, l1_ratio=1, batch_size=20,
                 replacement=False, reduction=1, projection='partial',
                 learning_rate=1, offset=0, var_red='weight_based',
                 shelve=True, mask=None, smoothing_fwhm=None,
                 standardize=True, detrend=True, low_pass=None,
                 high_pass=None, t_r=None, target_affine=None,
                 target_shape=None, mask_strategy='epi', mask_args=None,
                 memory=Memory(cachedir=None), memory_level=0, n_jobs=1,
                 backend='python', verbose=0, trace_folder=None):
        """Store the estimator parameters.

        The masking, caching and parallelism parameters are forwarded to
        ``BaseDecomposition.__init__``; every remaining parameter is stored
        unchanged as an attribute of the same name.
        """
        # parameters handled by the base decomposition class
        BaseDecomposition.__init__(
            self,
            n_components=n_components, random_state=random_state,
            mask=mask, smoothing_fwhm=smoothing_fwhm,
            standardize=standardize, detrend=detrend,
            low_pass=low_pass, high_pass=high_pass, t_r=t_r,
            target_affine=target_affine, target_shape=target_shape,
            mask_strategy=mask_strategy, mask_args=mask_args,
            memory=memory, memory_level=memory_level,
            n_jobs=n_jobs, verbose=verbose)

        # parameters specific to this estimator
        self.l1_ratio = l1_ratio
        self.alpha = alpha
        self.dict_init = dict_init
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.reduction = reduction
        self.projection = projection
        self.var_red = var_red
        self.replacement = replacement

        self.backend = backend
        self.shelve = shelve
        self.trace_folder = trace_folder

        self.learning_rate = learning_rate
        self.offset = offset
Example #31
0
class SecondLevelModel(BaseEstimator, TransformerMixin, CacheMixin):
    """ Implementation of the General Linear Model for multiple subject
    fMRI data

    Parameters
    ----------

    mask_img: Niimg-like, NiftiMasker or MultiNiftiMasker object, optional,
        Mask to be used on data. If an instance of masker is passed,
        then its mask will be used. If no mask is given,
        it will be computed automatically by a MultiNiftiMasker with default
        parameters. Automatic mask computation assumes first level imgs have
        already been masked.

    smoothing_fwhm: float, optional
        If smoothing_fwhm is not None, it gives the size in millimeters of the
        spatial smoothing to apply to the signal.

    memory: string, optional
        Path to the directory used to cache the masking process and the glm
        fit. By default, no caching is done. Creates instance of joblib.Memory.

    memory_level: integer, optional
        Rough estimator of the amount of memory used by caching. Higher value
        means more memory for caching.

    verbose : integer, optional
        Indicate the level of verbosity. By default, nothing is printed.
        If 0 prints nothing. If 1 prints final computation time.
        If 2 prints masker computation details.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs', -2 'all CPUs but one', and so on.

    minimize_memory : boolean, optional
        Gets rid of some variables on the model fit results that are not
        necessary for contrast computation and would only be useful for
        further inspection of model details. This has an important impact
        on memory consumption. True by default.

    """
    @replace_parameters({'mask': 'mask_img'}, end_version='next')
    def __init__(self, mask_img=None, smoothing_fwhm=None,
                 memory=Memory(None), memory_level=1, verbose=0,
                 n_jobs=1, minimize_memory=True):
        self.mask_img = mask_img
        self.smoothing_fwhm = smoothing_fwhm
        # a string is taken as the cache directory of a new Memory object
        if isinstance(memory, _basestring):
            self.memory = Memory(memory)
        else:
            self.memory = memory
        self.memory_level = memory_level
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.minimize_memory = minimize_memory
        # filled in by fit
        self.second_level_input_ = None
        self.confounds_ = None

    def fit(self, second_level_input, confounds=None, design_matrix=None):
        """ Fit the second-level GLM

        1. create design matrix
        2. do a masker job: fMRI_data -> Y
        3. fit regression to (Y, X)

        Parameters
        ----------
        second_level_input: list of `FirstLevelModel` objects or pandas
                            DataFrame or list of Niimg-like objects.

            Giving FirstLevelModel objects will allow to easily compute
            the second level contast of arbitrary first level contrasts thanks
            to the first_level_contrast argument of the compute_contrast
            method. Effect size images will be computed for each model to
            contrast at the second level.

            If a pandas DataFrame, then they have to contain subject_label,
            map_name and effects_map_path. It can contain multiple maps that
            would be selected during contrast estimation with the argument
            first_level_contrast of the compute_contrast function. The
            DataFrame will be sorted based on the subject_label column to avoid
            order inconsistencies when extracting the maps. So the rows of the
            automatically computed design matrix, if not provided, will
            correspond to the sorted subject_label column.

            If list of Niimg-like objects then this is taken literally as Y
            for the model fit and design_matrix must be provided.

        confounds: pandas DataFrame, optional
            Must contain a subject_label column. All other columns are
            considered as confounds and included in the model. If
            design_matrix is provided then this argument is ignored.
            The resulting second level design matrix uses the same column
            names as in the given DataFrame for confounds. At least two columns
            are expected, "subject_label" and at least one confound.

        design_matrix: pandas DataFrame, optional
            Design matrix to fit the GLM. The number of rows
            in the design matrix must agree with the number of maps derived
            from second_level_input.
            Ensure that the order of maps given by a second_level_input
            list of Niimgs matches the order of the rows in the design matrix.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``second_level_input``, ``confounds`` or ``design_matrix`` do
            not satisfy the requirements described above.

        """
        # Check parameters
        # check first level input
        if isinstance(second_level_input, list):
            if len(second_level_input) < 2:
                raise ValueError('A second level model requires a list with at'
                                 ' least two first level models or niimgs')
            # Check FirstLevelModel objects case
            if isinstance(second_level_input[0], FirstLevelModel):
                models_input = enumerate(second_level_input)
                for model_idx, first_level_model in models_input:
                    # The type check comes first so that a foreign object
                    # gets this error rather than an AttributeError from the
                    # attribute accesses below.
                    if not isinstance(first_level_model, FirstLevelModel):
                        raise ValueError(' object at idx %d is %s instead of'
                                         ' FirstLevelModel object' %
                                         (model_idx, type(first_level_model)))
                    if (first_level_model.labels_ is None or
                            first_level_model.results_ is None):
                        raise ValueError(
                            'Model %s at index %i has not been fit yet'
                            '' % (first_level_model.subject_label, model_idx))
                    if confounds is not None:
                        if first_level_model.subject_label is None:
                            raise ValueError(
                                'In case confounds are provided, first level '
                                'objects need to provide the attribute '
                                'subject_label to match rows appropriately. '
                                'Model at idx %d does not provide it. '
                                'To set it, you can do '
                                'first_level_model.subject_label = "01"'
                                '' % (model_idx))
            # Check niimgs case
            elif isinstance(second_level_input[0], (str, Nifti1Image)):
                if design_matrix is None:
                    raise ValueError('List of niimgs as second_level_input'
                                     ' require a design matrix to be provided')
                for model_idx, niimg in enumerate(second_level_input):
                    if not isinstance(niimg, (str, Nifti1Image)):
                        raise ValueError(' object at idx %d is %s instead of'
                                         ' Niimg-like object' %
                                         (model_idx, type(niimg)))
        # Check pandas dataframe case
        elif isinstance(second_level_input, pd.DataFrame):
            for col in ['subject_label', 'map_name', 'effects_map_path']:
                if col not in second_level_input.columns:
                    raise ValueError('second_level_input DataFrame must have'
                                     ' columns subject_label, map_name and'
                                     ' effects_map_path')
            # Make sure subject_label contain strings.
            # ``isinstance(dtype, np.object)`` was true for every dtype, so
            # the check never fired; compare the column dtype itself.
            labels_dtype = second_level_input['subject_label'].dtype
            if not pd.api.types.is_object_dtype(labels_dtype):
                raise ValueError('subject_label column must be of dtype '
                                 'object instead of dtype %s' % labels_dtype)
        elif isinstance(second_level_input, (str, Nifti1Image)):
            if design_matrix is None:
                raise ValueError('List of niimgs as second_level_input'
                                 ' require a design matrix to be provided')
            second_level_input = check_niimg(niimg=second_level_input,
                                             ensure_ndim=4)
        else:
            raise ValueError('second_level_input must be a list of'
                             ' `FirstLevelModel` objects, a pandas DataFrame'
                             ' or a list Niimg-like objects. Instead %s '
                             'was provided' % type(second_level_input))

        # check confounds
        if confounds is not None:
            if not isinstance(confounds, pd.DataFrame):
                raise ValueError('confounds must be a pandas DataFrame')
            if 'subject_label' not in confounds.columns:
                raise ValueError('confounds DataFrame must contain column '
                                 '"subject_label"')
            if len(confounds.columns) < 2:
                raise ValueError('confounds should contain at least 2 columns '
                                 'one called "subject_label" and the other '
                                 'with a given confound')
            # Make sure subject_label contain strings
            labels_dtype = confounds['subject_label'].dtype
            if not pd.api.types.is_object_dtype(labels_dtype):
                raise ValueError('subject_label column must be of dtype '
                                 'object instead of dtype %s' % labels_dtype)

        # check design matrix
        if design_matrix is not None:
            if not isinstance(design_matrix, pd.DataFrame):
                raise ValueError('design matrix must be a pandas DataFrame')

        # sort a pandas dataframe by subject_label to avoid inconsistencies
        # with the design matrix row order when automatically extracting maps
        if isinstance(second_level_input, pd.DataFrame):
            columns = second_level_input.columns.tolist()
            column_index = columns.index('subject_label')
            sorted_matrix = sorted(
                second_level_input.values, key=lambda x: x[column_index])
            sorted_input = pd.DataFrame(sorted_matrix, columns=columns)
            second_level_input = sorted_input

        self.second_level_input_ = second_level_input
        self.confounds_ = confounds

        # Report progress
        t0 = time.time()
        if self.verbose > 0:
            sys.stderr.write("Fitting second level model. "
                             "Take a deep breath\r")

        # Select sample map for masker fit and get subjects_label for design
        if isinstance(second_level_input, pd.DataFrame):
            sample_map = second_level_input['effects_map_path'][0]
            labels = second_level_input['subject_label']
            subjects_label = labels.values.tolist()
        elif isinstance(second_level_input, Nifti1Image):
            sample_map = mean_img(second_level_input)
        elif isinstance(second_level_input[0], FirstLevelModel):
            sample_model = second_level_input[0]
            sample_condition = sample_model.design_matrices_[0].columns[0]
            sample_map = sample_model.compute_contrast(
                sample_condition, output_type='effect_size')
            labels = [model.subject_label for model in second_level_input]
            subjects_label = labels
        else:
            # In this case design matrix had to be provided
            sample_map = mean_img(second_level_input)

        # Create and set design matrix, if not given. The niimg branches
        # above never define subjects_label, but they are only reachable with
        # a design matrix (enforced by the parameter checks).
        if design_matrix is None:
            design_matrix = make_second_level_design_matrix(subjects_label,
                                                            confounds)
        self.design_matrix_ = design_matrix

        # Learn the mask. Assume the first level imgs have been masked.
        if not isinstance(self.mask_img, NiftiMasker):
            self.masker_ = NiftiMasker(
                mask_img=self.mask_img, smoothing_fwhm=self.smoothing_fwhm,
                memory=self.memory, verbose=max(0, self.verbose - 1),
                memory_level=self.memory_level)
        else:
            # Work on a clone so the user's masker is left untouched; our
            # own non-None parameters take precedence over the masker's.
            self.masker_ = clone(self.mask_img)
            for param_name in ['smoothing_fwhm', 'memory', 'memory_level']:
                our_param = getattr(self, param_name)
                if our_param is None:
                    continue
                if getattr(self.masker_, param_name) is not None:
                    warn('Parameter %s of the masker overriden' % param_name)
                setattr(self.masker_, param_name, our_param)
        self.masker_.fit(sample_map)

        # Report progress
        if self.verbose > 0:
            sys.stderr.write("\nComputation of second level model done in "
                             "%i seconds\n" % (time.time() - t0))

        return self

    def compute_contrast(
            self, second_level_contrast=None, first_level_contrast=None,
            second_level_stat_type=None, output_type='z_score'):
        r"""Generate different outputs corresponding to
        the contrasts provided e.g. z_map, t_map, effects and variance.

        Parameters
        ----------
        second_level_contrast: str or array of shape (n_col), optional
            Where ``n_col`` is the number of columns of the design matrix,
            The string can be a formula compatible with the linear constraint
            of the Patsy library. Basically one can use the name of the
            conditions as they appear in the design matrix of
            the fitted model combined with operators /\*+- and numbers.
            Please check the patsy documentation for formula examples:
            http://patsy.readthedocs.io/en/latest/API-reference.html#patsy.DesignInfo.linear_constraint
            The default (None) is accepted if the design matrix has a single
            column, in which case the only possible contrast array([1]) is
            applied; when the design matrix has multiple columns, an error is
            raised.

        first_level_contrast: str or array of shape (n_col) with respect to
                              FirstLevelModel, optional

            In case a list of FirstLevelModel was provided as
            second_level_input, we have to provide a contrast to apply to
            the first level models to get the corresponding list of images
            desired, that would be tested at the second level. In case a
            pandas DataFrame was provided as second_level_input this is the
            map name to extract from the pandas dataframe map_name column.
            It has to be a 't' contrast.

        second_level_stat_type: {'t', 'F'}, optional
            Type of the second level contrast

        output_type: str, optional
            Type of the output map. Can be 'z_score', 'stat', 'p_value',
            'effect_size' or 'effect_variance'

        Returns
        -------
        output_image: Nifti1Image
            The desired output image

        Raises
        ------
        ValueError
            If the model has not been fit, if a required contrast is missing
            or null, if ``output_type`` is not one of the accepted values, or
            if the number of effect maps differs from the number of rows of
            the design matrix.

        """
        if self.second_level_input_ is None:
            raise ValueError('The model has not been fit yet')

        # first_level_contrast check. Only a list can hold FirstLevelModel
        # objects; indexing a DataFrame or an image with [0] would raise
        # before the check could be made.
        if (isinstance(self.second_level_input_, list) and
                isinstance(self.second_level_input_[0], FirstLevelModel)):
            if first_level_contrast is None:
                raise ValueError('If second_level_input was a list of '
                                 'FirstLevelModel, then first_level_contrast '
                                 'is mandatory. It corresponds to the '
                                 'second_level_contrast argument of the '
                                 'compute_contrast method of FirstLevelModel')

        # check contrast definition
        if second_level_contrast is None:
            if self.design_matrix_.shape[1] == 1:
                second_level_contrast = np.ones([1])
            else:
                raise ValueError('No second-level contrast is specified.')
        if isinstance(second_level_contrast, np.ndarray):
            con_val = second_level_contrast
            if np.all(con_val == 0):
                raise ValueError('Contrast is null')
        else:
            # anything else is handed to patsy as a linear constraint over
            # the design matrix column names
            design_info = DesignInfo(self.design_matrix_.columns.tolist())
            constraint = design_info.linear_constraint(second_level_contrast)
            con_val = constraint.coefs
        # check output type
        if isinstance(output_type, _basestring):
            if output_type not in ['z_score', 'stat', 'p_value', 'effect_size',
                                   'effect_variance']:
                raise ValueError(
                    'output_type must be one of "z_score", "stat"'
                    ', "p_value", "effect_size" or "effect_variance"')
        else:
            raise ValueError('output_type must be one of "z_score", "stat",'
                             ' "p_value", "effect_size" or "effect_variance"')

        # Get effect_maps appropriate for chosen contrast
        effect_maps = _infer_effect_maps(self.second_level_input_,
                                         first_level_contrast)
        # Check design matrix X and effect maps Y agree on number of rows
        if len(effect_maps) != self.design_matrix_.shape[0]:
            raise ValueError(
                'design_matrix does not match the number of maps considered. '
                '%i rows in design matrix do not match with %i maps' %
                (self.design_matrix_.shape[0], len(effect_maps)))

        # Fit an Ordinary Least Squares regression for parametric statistics
        Y = self.masker_.transform(effect_maps)
        if self.memory:
            mem_glm = self.memory.cache(run_glm, ignore=['n_jobs'])
        else:
            mem_glm = run_glm
        labels, results = mem_glm(Y, self.design_matrix_.values,
                                  n_jobs=self.n_jobs, noise_model='ols')
        # We save memory if inspecting model details is not necessary
        if self.minimize_memory:
            for key in results:
                results[key] = SimpleRegressionResults(results[key])
        self.labels_ = labels
        self.results_ = results

        # We compute contrast object
        if self.memory:
            mem_contrast = self.memory.cache(compute_contrast)
        else:
            mem_contrast = compute_contrast
        contrast = mem_contrast(self.labels_, self.results_, con_val,
                                second_level_stat_type)

        # We get desired output from contrast object: output_type names the
        # contrast method to call
        estimate_ = getattr(contrast, output_type)()

        # Prepare the returned images
        output = self.masker_.inverse_transform(estimate_)
        contrast_name = str(con_val)
        output.header['descrip'] = (
            '%s of contrast %s' % (output_type, contrast_name))
        return output
aforementioned settings. In general, speed up is increasing as
the index size grows.
"""

from __future__ import division

import numpy as np
from tempfile import gettempdir
from time import time

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors.approximate import LSHForest
from sklearn.datasets import make_blobs
from sklearn.externals.joblib import Memory

# On-disk joblib cache rooted in the system temp directory; used below as the
# ``@m.cache()`` decorator so generated data is reused between runs.
m = Memory(cachedir=gettempdir())


@m.cache()
def make_data(n_samples, n_features, n_queries, random_state=0):
    """Create index and query data."""
    print('Generating random blob-ish data')
    X, _ = make_blobs(n_samples=n_samples + n_queries,
                      n_features=n_features,
                      centers=100,
                      shuffle=True,
                      random_state=random_state)

    # Keep the last samples as held out query vectors: note since we used
    # shuffle=True we have ensured that index and query vectors are
    # samples from the same distribution (a mixture of 100 gaussians in this
Example #33
0
"""

import os
from collections import namedtuple
import matplotlib.pyplot as plt
from sklearn.externals.joblib import Memory
from pypreprocess.realign import MRIMotionCorrection
from pypreprocess.reporting.check_preprocessing import (
    plot_spm_motion_parameters)
from pypreprocess.datasets import (fetch_fsl_feeds, fetch_spm_multimodal_fmri,
                                   fetch_spm_auditory)
from nilearn.datasets import fetch_nyu_rest

# data structure for subject data: an identifier, the functional image
# path(s) and the directory outputs are written to
SubjectData = namedtuple('SubjectData', 'subject_id func output_dir')
# joblib cache stored in a "demos_cache" directory (relative path)
mem = Memory("demos_cache")


def _demo_runner(subjects, dataset_id, **spm_realign_kwargs):
    """Demo runner.

    Parameters
    ----------
    subjects: iterable for subject data
        each subject data can be anything, with a func (string or list
        of strings; existing file path(s)) and an output_dir (string,
        existing dir path) field
    dataset_id: string
        a short string describing the data being processed (e.g. "HAXBY!")

    Notes
Example #34
0
def boo(subject_idx=0, cut_coords=None):
    """Plot pairwise condition t-test maps for one Haxby subject.

    The functional run of the selected subject is masked with an EPI mask,
    smoothed, and an independent two-sample t-test is computed voxel-wise
    between every unordered pair of conditions. A histogram of the
    normalised p-values is shown, followed by a pair of stat maps for every
    comparison that involves the ``'face'`` condition.

    Parameters
    ----------
    subject_idx : int, optional
        Zero-based index of the subject to analyse (default 0).
        ``subject_idx + 1`` subjects are fetched so that this index exists.
    cut_coords : optional
        Forwarded unchanged to ``plot_roi`` when showing the EPI mask.

    Returns
    -------
    None
        The function only produces figures.
    """
    mem = Memory(cachedir='nilearn_cache')

    # ## Load the data ###################################################

    print("Fetch the data files from Internet")
    haxby_dataset = datasets.fetch_haxby(n_subjects=subject_idx + 1)

    print("Second, load the labels")
    # Read the labels of the *selected* subject (not always subject 0), and
    # use ``str`` because ``basestring`` only exists on Python 2.
    haxby_labels = np.genfromtxt(haxby_dataset.session_target[subject_idx],
                                 skip_header=1, usecols=[0],
                                 dtype=str)

    # ## Find voxels of interest ##############################################

    print("Load the data.")
    anat_img = nibabel.load(haxby_dataset.anat[subject_idx])
    fmri_raw_img = nibabel.load(haxby_dataset.func[subject_idx])

    print("Build a mask based on the activations.")
    epi_masker = NiftiMasker(mask_strategy='epi', detrend=True,
                             standardize=True)
    epi_masker = mem.cache(epi_masker.fit)(fmri_raw_img)
    plot_roi(epi_masker.mask_img_,
             title='EPI mask',
             cut_coords=cut_coords)

    # Per-voxel z-scoring is currently disabled: the raw image is smoothed
    # directly.
    print("Smooth the (spatial) data.")
    fmri_smooth_img = mem.cache(image.smooth_img)(fmri_raw_img, fwhm=1)

    print("Mask the MRI data.")
    masked_fmri_vectors = mem.cache(epi_masker.transform)(fmri_smooth_img)

    # ## Compute a similarity matrix ##########################################

    condition_names = list(np.unique(haxby_labels))
    n_conds = len(condition_names)
    # Floor division keeps the row count an integer under true division.
    n_compares = n_conds * (n_conds - 1) // 2
    p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1]))

    # Build the list of unordered pairs once so that the row index used when
    # filling ``p_vectors`` is the same one used when plotting.
    condition_pairs = [(cond, cond2)
                       for i, cond in enumerate(condition_names)
                       for cond2 in condition_names[i + 1:]]

    for idx, (cond, cond2) in enumerate(condition_pairs):
        print("Computing ttest for %s vs. %s." % (cond, cond2))
        _, p_vectors[idx, :] = stats.ttest_ind(
            masked_fmri_vectors[haxby_labels == cond, :],
            masked_fmri_vectors[haxby_labels == cond2, :],
            axis=0)

    p_vectors_normd = p_vectors / p_vectors.max(axis=0)
    plt.figure()
    plt.hist(p_vectors_normd.max(axis=0), 100)
    plt.show()

    for idx, (cond, cond2) in enumerate(condition_pairs):
        # Only comparisons involving faces are plotted; ``idx`` still
        # advances for skipped pairs because it comes from ``enumerate``.
        if cond != 'face' and cond2 != 'face':
            continue

        print("Plotting compares for %s vs. %s." % (cond, cond2))
        inv_p_img = epi_masker.inverse_transform(1 / p_vectors[idx, :])
        normd_p_img = epi_masker.inverse_transform(
            1. - p_vectors_normd[idx, :])
        plot_two_maps(plot_stat_map,
                      (inv_p_img, "%s vs. %s." % (cond, cond2)),
                      (normd_p_img, "%s vs. %s. (norm'd)"
                          % (cond, cond2)), bg_img=anat_img)
        plt.show()
Example #35
0
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
            metric='minkowski', p=2, leaf_size=40,
            algorithm='best', memory=Memory(cachedir=None, verbose=0),
            approx_min_span_tree=True, gen_min_span_tree=False,
            core_dist_n_jobs=4, allow_single_cluster=False, **kwargs):

    """Perform HDBSCAN clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    min_cluster_size : int, optional
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise.

    min_samples : int, optional
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        defaults to the min_cluster_size.

    alpha : float, optional
        A distance scaling parameter as used in robust single linkage.
        See (K. Chaudhuri and S. Dasgupta  "Rates of convergence
        for the cluster tree."). (default 1.0)

    metric : string, or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.
        (default minkowski)

    p : int, optional
        p value to use if using the minkowski metric. (default 2)

    leaf_size : int, optional
        Leaf size for trees responsible for fast nearest
        neighbour queries. (default 40)

    algorithm : string, optional
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.
        (default True)

    gen_min_span_tree : bool, optional
        Whether to generate the minimum spanning tree for later analysis.
        (default False)

    core_dist_n_jobs : int, optional
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm).
        (default 4)

    allow_single_cluster : bool, optional
        By default HDBSCAN* will not produce a single cluster, setting this
        to True will override this and allow single cluster results in
        the case that you feel this is a valid result for your dataset.
        (default False)

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    probabilities : array [n_samples]
        Cluster membership strengths for each point. Noisy samples are assigned
        0.

    cluster_persistence : array, shape = [n_clusters]
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : array [n_samples - 1, 4]
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : array [n_samples - 1, 3]
        The minimum spanning as an edgelist. If gen_min_span_tree was False
        this will be None.

    Raises
    ------
    ValueError
        If ``min_samples``/``min_cluster_size`` are not positive ints, if
        ``alpha`` is not a positive non-int value, if ``leaf_size`` is below
        1, or if ``metric`` is not supported by the tree type that the
        requested ``algorithm`` relies on.
    TypeError
        If ``algorithm`` is not one of the documented options.

    References
    ----------
    R. Campello, D. Moulavi, and J. Sander, "Density-Based Clustering Based on
    Hierarchical Density Estimates"
    In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172.
    2013
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if type(min_samples) is not int or type(min_cluster_size) is not int:
        raise ValueError('Min samples and min cluster size must be integers!')

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError('Min samples and Min cluster size must be positive integers')

    if alpha <= 0.0 or type(alpha) is int:
        raise ValueError('Alpha must be a positive value greater than 0!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    # Checks input and converts to an nd-array where possible
    X = check_array(X, accept_sparse='csr')
    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        if algorithm == 'generic':
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_generic)(X, min_samples, alpha, metric,
                                               p, leaf_size, gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_kdtree':
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Prim's with KDTree for this metric!")
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
                                                    metric, p, leaf_size,
                                                    gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Prim's with BallTree for this metric!")
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_prims_balltree)(X, min_samples, alpha,
                                                      metric, p, leaf_size,
                                                      gen_min_span_tree, **kwargs)
        elif algorithm == 'boruvka_kdtree':
            # The KD-tree variant must be validated against the KD-tree
            # metric list, not the (larger) BallTree one.
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with KDTree for this metric!")
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                                      metric, p, leaf_size,
                                                      approx_min_span_tree,
                                                      gen_min_span_tree,
                                                      core_dist_n_jobs, **kwargs)
        elif algorithm == 'boruvka_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with BallTree for this metric!")
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                                        metric, p, leaf_size,
                                                        approx_min_span_tree,
                                                        gen_min_span_tree,
                                                        core_dist_n_jobs, **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:

        if issparse(X) or metric not in FAST_METRICS:  # We can't do much with sparse matrices ...
            (single_linkage_tree,
             result_min_span_tree) = \
                memory.cache(_hdbscan_generic)(X, min_samples,
                                               alpha, metric, p, leaf_size,
                                               gen_min_span_tree, **kwargs)
        elif metric in KDTree.valid_metrics:
            # TO DO: Need heuristic to decide when to go to boruvka; still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
                                                        metric, p, leaf_size,
                                                        gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                                          metric, p, leaf_size,
                                                          approx_min_span_tree,
                                                          gen_min_span_tree,
                                                          core_dist_n_jobs, **kwargs)
        else:  # Metric is a valid BallTree metric
            # TO DO: Need heuristic to decide when to go to boruvka; still debugging for now
            if X.shape[1] > 60:
                # This branch is only reached when the metric is NOT a valid
                # KD-tree metric, so the BallTree flavour of Prim's is needed.
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_prims_balltree)(X, min_samples, alpha,
                                                          metric, p, leaf_size,
                                                          gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree,
                 result_min_span_tree) = \
                    memory.cache(_hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                                            metric, p, leaf_size,
                                                            approx_min_span_tree,
                                                            gen_min_span_tree,
                                                            core_dist_n_jobs, **kwargs)

    return _tree_to_labels(X,
                           single_linkage_tree,
                           min_cluster_size,
                           allow_single_cluster) + (result_min_span_tree,)
Example #36
0
def test_selectors(filepath='features_final.csv', scaler='minMax'):
    """Grid-search three feature reducers and plot their best accuracy.

    Loads and normalises the feature table, then runs a 2-fold grid search
    over PCA, NMF and ``SelectKBest(chi2)`` (each followed by a
    ``LinearSVC``) for several output dimensionalities and two values of
    ``C``. The pipeline caches fitted transformers in a temporary
    directory, which is always removed afterwards. A grouped bar chart of the
    best mean test score per reducer and dimensionality is displayed.

    Parameters
    ----------
    filepath : str, optional
        Path handed to ``loadFeatures_NoSplit``.
    scaler : str, optional
        Scaler name handed to ``normalize``.
    """
    from sklearn.externals.joblib import Memory
    from tempfile import mkdtemp
    from shutil import rmtree

    X, y, _ = loadFeatures_NoSplit(filepath)
    X = normalize(X, scaler)

    N_FEATURES_OPTIONS = [2, 10, 20, 40, 60, 120]
    C_OPTIONS = [1, 10]
    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]
    reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

    # Create a temporary folder to store the transformers of the pipeline
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)
        cached_pipe = Pipeline([('reduce_dim', PCA()),
                                ('classify', LinearSVC())],
                               memory=memory)

        # A cached pipeline is used within the grid search
        grid = GridSearchCV(cached_pipe, cv=2, n_jobs=1,
                            param_grid=param_grid)
        grid.fit(X, y)
    finally:
        # Delete the temporary cache even when the search fails
        rmtree(cachedir)

    mean_scores = np.array(grid.cv_results_['mean_test_score'])
    # scores are in the order of param_grid iteration, which is alphabetical
    mean_scores = mean_scores.reshape(len(C_OPTIONS), -1,
                                      len(N_FEATURES_OPTIONS))
    # select score for best C
    mean_scores = mean_scores.max(axis=0)
    bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
                   (len(reducer_labels) + 1) + .5)

    plt.figure()
    COLORS = 'bgrcmyk'
    for i, (label,
            reducer_scores) in enumerate(zip(reducer_labels, mean_scores)):
        plt.bar(bar_offsets + i, reducer_scores, label=label, color=COLORS[i])

    plt.title("Comparing feature reduction techniques")
    plt.xlabel('Reduced number of features')
    plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
    plt.ylabel('Classification accuracy')
    plt.ylim((0, 1))
    plt.legend(loc='upper left')
    plt.show()
Example #37
0
"""Utility functions for autoreject."""

# Authors: Mainak Jas <*****@*****.**>
#          Denis A. Engemann <*****@*****.**>

from collections import defaultdict
import warnings

import mne
from mne.utils import check_version as version_is_greater_equal
from mne import pick_types, pick_channels, pick_info
from mne.channels.interpolation import _do_interp_dots

from sklearn.externals.joblib import Memory

# Module-level joblib cache stored in a directory named 'cachedir'
# (relative path).
mem = Memory(cachedir='cachedir')


def _get_ch_type_from_picks(picks, info):
    """Return the distinct channel types of ``picks``, in first-seen order.

    Each index in ``picks`` is resolved to its channel type through
    ``mne.io.pick.channel_type``; a type is listed once, at the position of
    the first pick that has it.
    """
    seen_types = []
    for ch_idx in picks:
        ch_type = mne.io.pick.channel_type(info=info, idx=ch_idx)
        if ch_type in seen_types:
            continue
        seen_types.append(ch_type)
    return seen_types


def _check_data(epochs,
                picks,
                ch_constraint='data_channels',
Example #38
0
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.externals.joblib import Memory
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load the comma-separated measurement table.
hyper_data = pd.read_csv("../Data/headers3mgperml.csv", sep=',')

# Features are every column from index 16 onwards; the target is column 2.
X = hyper_data.values[:, 16:]
y = hyper_data.values[:, 2]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=75)

# 2-fold cross-validation for the grid search below.
cv = KFold(2)
ridge = BayesianRidge()
# Temporary directory backing the joblib cache handed to the agglomeration.
# NOTE(review): nothing in this fragment removes it afterwards.
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)
#
# Pixel-adjacency graph of a 240 x 34 grid, used to constrain the clustering.
# Presumably each row of X is a flattened 240 x 34 image -- TODO confirm.
connectivity = grid_to_graph(n_x=240, n_y=34)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)
#
# Feature agglomeration followed by Bayesian ridge regression; the number of
# clusters is selected by grid search over {10, 20, 30}.
clf = Pipeline([('ward', ward), ('ridge', ridge)])
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X,y)
# coef_ = clf.best_estimator_.steps[-1][1].coef_
# coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
# coef_agglomeration_ = coef_.reshape(240, 34)

def evaluate_model(dataset, pipeline_components, pipeline_parameters):
    """Cross-validate every parameter combination of a pipeline on a dataset.

    For each combination, a pipeline is built from ``pipeline_components``,
    evaluated with 10-fold stratified cross-validated predictions, and one
    tab-separated result line is printed to stdout: dataset name,
    preprocessor class, preprocessor parameters, classifier class, classifier
    parameters, accuracy, macro F1 and balanced accuracy. Combinations that
    raise during fitting or scoring are skipped silently.

    Parameters
    ----------
    dataset : str
        Path to a gzip-compressed, tab-separated file with a ``class``
        column; all other columns are used as float features. The last 7
        characters of the file name are dropped for the printed dataset name
        (presumably a ``.tsv.gz`` suffix -- TODO confirm).
    pipeline_components : sequence of callables
        Estimator classes, in pipeline order. The first one is reported as
        the preprocessor and the last one as the classifier.
    pipeline_parameters : dict
        Maps a component class to an iterable of keyword-argument dicts.
        The cartesian product over all entries is evaluated. The classifier
        class (last component) must have an entry, otherwise building the
        result line raises ``KeyError``.
    """
    input_data = pd.read_csv(dataset, compression='gzip', sep='\t')
    features = input_data.drop('class', axis=1).values.astype(float)
    labels = input_data['class'].values

    pipelines = [
        dict(zip(pipeline_parameters.keys(), list(parameter_combination)))
        for parameter_combination in itertools.product(
            *pipeline_parameters.values())
    ]

    # Create a temporary folder to store the transformers of the pipeline
    cachedir = mkdtemp()
    memory = Memory(cachedir=cachedir, verbose=0)

    try:
        with warnings.catch_warnings():
            # Squash warning messages. Turn this off when debugging!
            warnings.simplefilter('ignore')

            for pipe_parameters in pipelines:
                pipeline = []
                for component in pipeline_components:
                    if component in pipe_parameters:
                        args = pipe_parameters[component]
                        pipeline.append(component(**args))
                    else:
                        pipeline.append(component())

                try:
                    clf = make_pipeline(*pipeline, memory=memory)
                    cv_predictions = cross_val_predict(
                        estimator=clf,
                        X=features,
                        y=labels,
                        cv=StratifiedKFold(n_splits=10,
                                           shuffle=True,
                                           random_state=90483257))
                    accuracy = accuracy_score(labels, cv_predictions)
                    macro_f1 = f1_score(labels, cv_predictions,
                                        average='macro')
                    balanced_accuracy = balanced_accuracy_score(
                        labels, cv_predictions)
                except KeyboardInterrupt:
                    # SystemExit still runs the ``finally`` below, so the
                    # cache directory is removed on this path as well.
                    sys.exit(1)
                # This is a catch-all to make sure that the evaluation won't
                # crash due to a bad parameter combination or bad data.
                # Turn this off when debugging!
                except Exception:
                    continue

                preprocessor_class = pipeline_components[0]

                preprocessor_param_string = 'default'

                if preprocessor_class in pipe_parameters:
                    preprocessor_param_string = ','.join([
                        '{}={}'.format(
                            parameter,
                            '|'.join([x.strip()
                                      for x in str(value).split(',')]))
                        for parameter, value in
                        pipe_parameters[preprocessor_class].items()
                    ])

                classifier_class = pipeline_components[-1]
                param_string = ','.join([
                    '{}={}'.format(parameter, value) for parameter, value in
                    pipe_parameters[classifier_class].items()
                ])

                out_text = '\t'.join([
                    dataset.split('/')[-1][:-7], preprocessor_class.__name__,
                    preprocessor_param_string, classifier_class.__name__,
                    param_string,
                    str(accuracy),
                    str(macro_f1),
                    str(balanced_accuracy)
                ])

                print(out_text)
                sys.stdout.flush()
    finally:
        # Delete the temporary cache before exiting, whatever happened above
        rmtree(cachedir)
# Fetch the MSDL atlas and the ADHD dataset for ``n_subjects`` subjects
# (``n_subjects`` is defined earlier in the script).
msdl_atlas_dataset = datasets.fetch_atlas_msdl()
adhd_dataset = datasets.fetch_adhd(n_subjects=n_subjects)

# print basic information on the dataset
print('First subject functional nifti image (4D) is at: %s' %
      adhd_dataset.func[0])  # 4D data

##############################################################################
# Extracting region signals
# --------------------------
from nilearn import image
from nilearn import input_data

# A "memory" to avoid recomputation
# NOTE(review): ``mem`` is not referenced in the lines visible here; the
# masker below is given the cache directory name as a string instead.
from sklearn.externals.joblib import Memory
mem = Memory('nilearn_cache')

# Masker that extracts one signal per atlas map: images are resampled to the
# maps, detrended, high-pass filtered at 0.01 with t_r=2.5 and standardized.
masker = input_data.NiftiMapsMasker(msdl_atlas_dataset.maps,
                                    resampling_target="maps",
                                    detrend=True,
                                    low_pass=None,
                                    high_pass=0.01,
                                    t_r=2.5,
                                    standardize=True,
                                    memory='nilearn_cache',
                                    memory_level=1,
                                    verbose=2)
masker.fit()

# Accumulator for per-subject time series, and the functional image paths.
subject_time_series = []
func_filenames = adhd_dataset.func
Example #41
0
def test_pipeline_memory_sampler():
    """Check that a memory-cached pipeline with a sampler matches a plain one.

    The cached pipeline is fitted twice and a second cached pipeline with a
    renamed step is fitted once; every fit must give the same predictions,
    score and sampler ``means_`` as the uncached pipeline, and the sampler
    timestamp must stay the one recorded at the first fit.
    """
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    def check_same_outputs(reference, candidate, candidate_step):
        # Compare the candidate pipeline against the uncached reference.
        for method in ('predict', 'predict_proba', 'predict_log_proba'):
            assert_array_equal(getattr(reference, method)(X),
                               getattr(candidate, method)(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)

        # Transformer + SVC; the uncached pipeline works on a clone of the
        # sampler while both pipelines share the classifier instance.
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # First fit: the transformer gets memoized
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Remember when the cached transformer was fitted
        ts = cached_pipe.named_steps['transf'].timestamp_
        check_same_outputs(pipe, cached_pipe, 'transf')
        assert_false(hasattr(transf, 'means_'))

        # Second fit: results must come from the cache
        cached_pipe.fit(X, y)
        check_same_outputs(pipe, cached_pipe, 'transf')
        assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_)

        # Fresh estimators under a different step name must still hit the
        # cache
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        check_same_outputs(pipe, cached_pipe_2, 'transf_2')
        assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
Example #42
0
def mask_and_reduce(masker,
                    imgs,
                    confounds=None,
                    reduction_ratio='auto',
                    n_components=None,
                    random_state=None,
                    memory_level=0,
                    memory=Memory(cachedir=None),
                    n_jobs=1):
    """Mask and reduce provided 4D images with given masker.

    Uses a PCA (randomized for small reduction ratio) or a range finding matrix
    on time series to reduce data size in time direction. For multiple images,
    the concatenation of data is returned, either as an ndarray or a memorymap
    (useful for big datasets that do not fit in memory).

    Parameters
    ----------
    masker: NiftiMasker or MultiNiftiMasker
        Instance used to mask provided data.

    imgs: list of 4D Niimg-like objects
        See http://nilearn.github.io/manipulating_images/input_output.html
        List of subject data to mask, reduce and stack. A single image
        (including a single file path given as a string) is accepted and
        treated as a one-element list.

    confounds: CSV file path or 2D matrix, optional
        This parameter is passed to signal.clean. Please see the
        corresponding documentation for details. When given, it is paired
        element-wise with ``imgs``; when None, every image gets None.

    reduction_ratio: 'auto' or float between 0. and 1.
        - Between 0. or 1. : controls data reduction in the temporal domain
        , 1. means no reduction, < 1. calls for an SVD based reduction.
        - if set to 'auto', estimator will set the number of components per
          reduced session to be n_components.

    n_components: integer, optional
        Number of components per subject to be extracted by dimension reduction

    random_state: int or RandomState
        Pseudo number generator state used for random sampling.

    memory_level: integer, optional
        Integer indicating the level of memorization. The higher, the more
        function calls are cached.

    memory: joblib.Memory
        Used to cache the function calls.

    n_jobs: integer, optional
        Number of jobs handed to ``Parallel`` for the per-image reduction.

    Returns
    -------
    data: ndarray or memorymap
        Concatenation of reduced data.

    Raises
    ------
    ValueError
        If a numeric ``reduction_ratio`` lies outside [0, 1].
    """
    # Strings expose ``__iter__`` on Python 3, so a lone file path has to be
    # detected explicitly or it would be iterated character by character.
    if isinstance(imgs, str) or not hasattr(imgs, '__iter__'):
        imgs = [imgs]

    if reduction_ratio == 'auto':
        if n_components is None:
            # Reduction ratio is 1 if
            # neither n_components nor ratio is provided
            reduction_ratio = 1
    else:
        if reduction_ratio is None:
            reduction_ratio = 1
        else:
            reduction_ratio = float(reduction_ratio)
        if not 0 <= reduction_ratio <= 1:
            raise ValueError('Reduction ratio should be between 0. and 1.,'
                             'got %.2f' % reduction_ratio)

    if confounds is None:
        # Endless stream of None so that zip() below is driven by ``imgs``
        confounds = itertools.repeat(confounds)

    if reduction_ratio == 'auto':
        # Still 'auto' here means n_components was provided
        n_samples = n_components
        reduction_ratio = None
    else:
        # We'll let _mask_and_reduce_single decide on the number of
        # samples based on the reduction_ratio
        n_samples = None

    data_list = Parallel(n_jobs=n_jobs)(
        delayed(_mask_and_reduce_single)(masker,
                                         img,
                                         confound,
                                         reduction_ratio=reduction_ratio,
                                         n_samples=n_samples,
                                         memory=memory,
                                         memory_level=memory_level,
                                         random_state=random_state)
        for img, confound in zip(imgs, confounds))

    subject_n_samples = [subject_data.shape[0] for subject_data in data_list]

    n_samples = np.sum(subject_n_samples)
    n_voxels = int(np.sum(_safe_get_data(masker.mask_img_)))
    data = np.empty((n_samples, n_voxels), order='F', dtype='float64')

    current_position = 0
    for i, next_position in enumerate(np.cumsum(subject_n_samples)):
        data[current_position:next_position] = data_list[i]
        current_position = next_position
        # Clear memory as fast as possible: remove the reference on
        # the corresponding block of data
        data_list[i] = None
    return data
Example #43
0
                             depth=4,
                             num_round=100)  #good!
        y_pred = clf.multi(X_train_cv,
                           y_train_cv,
                           X_test_cv,
                           3,
                           y_test=y_test_cv)
        xx.append(multiclass_log_loss(y_test_cv, y_pred))
        print xx[-1]  #,y_pred.shape,zz[-1]
        ypred[test_index] = y_pred
    print xx
    print 'average:', np.mean(xx), 'std', np.std(xx)
    return ypred, np.mean(xx)


# joblib cache stored under ./mycache; used below to memoize file loading.
mem = Memory("./mycache")


@mem.cache
def get_data(name):
    """Load the svmlight file ``name`` and return its first two components.

    The call is wrapped by ``mem.cache``, so the result of a given ``name``
    is memoized by that cache object.

    Parameters
    ----------
    name : str
        Path of the svmlight/libsvm formatted file to read.

    Returns
    -------
    tuple
        ``(loaded[0], loaded[1])`` of whatever ``load_svmlight_file``
        returns (presumably the feature matrix and the target vector).
    """
    loaded = load_svmlight_file(name)
    features, targets = loaded[0], loaded[1]
    return features, targets


# Load the five "rebuild" feature files. Only the first element returned by
# get_data is kept; the second one is discarded each time.
X, _ = get_data('../sparse/rebuild1.svm')
X1, _ = get_data('../sparse/rebuild2.svm')
X2, _ = get_data('../sparse/rebuild3.svm')
X3, _ = get_data('../sparse/rebuild4.svm')
X4, _ = get_data('../sparse/rebuild5.svm')
# Stack the five blocks side by side (same rows, concatenated columns) as
# CSR and then convert the result to a dense matrix.
# NOTE(review): `sparse` is presumably scipy.sparse -- confirm.
X = sparse.hstack([X, X1, X2, X3, X4], format='csr').todense()
# Training table read from CSV.
train = pd.read_csv('../explore/train1.csv')
Example #44
0
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
            metric='minkowski', p=2, leaf_size=40,
            algorithm='best', memory=Memory(cachedir=None, verbose=0),
            approx_min_span_tree=True, gen_min_span_tree=False,
            core_dist_n_jobs=4, **kwargs):
    """Perform HDBSCAN clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    min_cluster_size : int, optional
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise. (default 5)

    min_samples : int, optional
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        Defaults to ``min_cluster_size``.

    alpha : float, optional
        A distance scaling parameter as used in robust single linkage.
        See (K. Chaudhuri and S. Dasgupta  "Rates of convergence
        for the cluster tree."). Must be strictly positive and must not be
        a plain ``int``. (default 1.0)

    metric : string, or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.
        (default minkowski)

    p : int, optional
        p value to use if using the minkowski metric. (default 2)

    leaf_size : int, optional
        Leaf size for trees responsible for fast nearest
        neighbour queries. (default 40)

    algorithm : string, optional
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.
        Only forwarded to the Boruvka variants. (default True)

    gen_min_span_tree : bool, optional
        Whether to generate the minimum spanning tree for later analysis.
        (default False)

    core_dist_n_jobs : int, optional
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). Only forwarded to the Boruvka
        variants. (default 4)

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    probabilities : array [n_samples]
        Cluster membership strengths for each point. Noisy samples are assigned
        0.

    cluster_persistence : array, shape = [n_clusters]
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : array [n_samples - 1, 4]
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : array [n_samples - 1, 3]
        The minimum spanning as an edgelist. If gen_min_span_tree was False
        this will be None.

    Raises
    ------
    ValueError
        If ``min_samples`` or ``min_cluster_size`` is not a positive ``int``,
        if ``alpha`` is not positive or is an ``int``, if ``leaf_size`` is
        smaller than 1, or if ``metric`` is not valid for the tree type
        required by an explicitly requested ``algorithm``.
    TypeError
        If ``algorithm`` is not one of the documented options.

    References
    ----------
    R. Campello, D. Moulavi, and J. Sander, "Density-Based Clustering Based on
    Hierarchical Density Estimates"
    In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172.
    2013
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if type(min_samples) is not int or type(min_cluster_size) is not int:
        raise ValueError('Min samples and min cluster size must be integers!')

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError('Min samples and Min cluster size must be positive integers')

    # Plain ints are rejected on purpose: alpha has to be a real-valued scale.
    if alpha <= 0.0 or type(alpha) is int:
        raise ValueError('Alpha must be a positive value greater than 0!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    # Checks input and converts to an nd-array where possible
    X = check_array(X, accept_sparse='csr')
    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    # The generic and Prim's variants share one positional signature, the
    # Boruvka variants another; build both once and pick a builder below.
    prims_args = (X, min_samples, alpha, metric, p, leaf_size,
                  gen_min_span_tree)
    boruvka_args = (X, min_samples, alpha, metric, p, leaf_size,
                    approx_min_span_tree, gen_min_span_tree,
                    core_dist_n_jobs)

    if algorithm == 'best':
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            builder, builder_args = _hdbscan_generic, prims_args
        elif metric in KDTree.valid_metrics:
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                builder, builder_args = _hdbscan_prims_kdtree, prims_args
            else:
                builder, builder_args = _hdbscan_boruvka_kdtree, boruvka_args
        else:  # Metric is a valid BallTree metric
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                # The metric is not KDTree-compatible in this branch, so the
                # BallTree flavour of Prim's must be used here.
                builder, builder_args = _hdbscan_prims_balltree, prims_args
            else:
                builder, builder_args = (_hdbscan_boruvka_balltree,
                                         boruvka_args)
    elif algorithm == 'generic':
        builder, builder_args = _hdbscan_generic, prims_args
    elif algorithm == 'prims_kdtree':
        if metric not in KDTree.valid_metrics:
            raise ValueError("Cannot use Prim's with KDTree for this metric!")
        builder, builder_args = _hdbscan_prims_kdtree, prims_args
    elif algorithm == 'prims_balltree':
        if metric not in BallTree.valid_metrics:
            raise ValueError("Cannot use Prim's with BallTree for this metric!")
        builder, builder_args = _hdbscan_prims_balltree, prims_args
    elif algorithm == 'boruvka_kdtree':
        # Validate against the KDTree metric list (not BallTree's), matching
        # the tree type this variant is named after.
        if metric not in KDTree.valid_metrics:
            raise ValueError("Cannot use Boruvka with KDTree for this metric!")
        builder, builder_args = _hdbscan_boruvka_kdtree, boruvka_args
    elif algorithm == 'boruvka_balltree':
        if metric not in BallTree.valid_metrics:
            raise ValueError("Cannot use Boruvka with BallTree for this metric!")
        builder, builder_args = _hdbscan_boruvka_balltree, boruvka_args
    else:
        raise TypeError('Unknown algorithm type %s specified' % algorithm)

    # Run (or fetch from cache) the selected tree construction.
    (single_linkage_tree,
     result_min_span_tree) = memory.cache(builder)(*builder_args, **kwargs)

    return _tree_to_labels(X, single_linkage_tree, min_cluster_size) + (result_min_span_tree,)
Example #45
0
def first_level_models_from_bids(dataset_path,
                                 task_label,
                                 space_label,
                                 img_filters=None,
                                 t_r=None,
                                 slice_time_ref=0.,
                                 hrf_model='glover',
                                 drift_model='cosine',
                                 period_cut=128,
                                 drift_order=1,
                                 fir_delays=[0],
                                 min_onset=-24,
                                 mask=None,
                                 target_affine=None,
                                 target_shape=None,
                                 smoothing_fwhm=None,
                                 memory=Memory(None),
                                 memory_level=1,
                                 standardize=False,
                                 signal_scaling=0,
                                 noise_model='ar1',
                                 verbose=0,
                                 n_jobs=1,
                                 minimize_memory=True,
                                 derivatives_folder='derivatives'):
    """Create FirstLevelModel objects and fit arguments from a BIDS dataset.

    If t_r is not specified this function will attempt to load it from a
    preproc.json file in the derivatives folder (falling back to a bold.json
    file in the raw dataset), alongside slice_time_ref. Otherwise t_r and
    slice_time_ref are taken as given.

    Parameters
    ----------
    dataset_path: str
        Directory of the highest level folder of the BIDS dataset. Should
        contain subject folders and a derivatives folder.

    task_label: str
        Task_label as specified in the file names like _task-<task_label>_.

    space_label: str
        Specifies the space label of the preproc.nii images.
        As they are specified in the file names like _space-<space_label>_.

    img_filters: list of tuples (str, str), optional (default: None)
        Filters are of the form (field, label). Only one filter per field
        allowed. A file that does not match a filter will be discarded.
        Possible filters are 'acq', 'rec', 'run', 'res' and 'variant'.
        Filter examples would be (variant, smooth), (acq, pa) and
        (res, 1x1x1).

    derivatives_folder: str, optional
        derivatives and app folder path containing preprocessed files.
        Like "derivatives/FMRIPREP". default is simply "derivatives".

    All other parameters correspond to a `FirstLevelModel` object, which
    contains their documentation. The subject label of the model will be
    determined directly from the BIDS dataset.

    Returns
    -------
    models: list of `FirstLevelModel` objects
        Each FirstLevelModel object corresponds to a subject. All runs from
        different sessions are considered together for the same subject to run
        a fixed effects analysis on them.

    models_run_imgs: list of list of Niimg-like objects,
        Items for the FirstLevelModel fit function of their respective model.

    models_events: list of list of pandas DataFrames,
        Items for the FirstLevelModel fit function of their respective model.

    models_confounds: list of list of pandas DataFrames,
        Items for the FirstLevelModel fit function. An entry is only added
        for subjects for which confounds files were found.

    Raises
    ------
    TypeError
        If dataset_path, task_label or space_label is not a string, if
        img_filters is not a list, or if a filter is not a pair of strings.

    ValueError
        If dataset_path or the derivatives folder does not exist, if a
        filter field is not allowed, if several images are found for the
        same session/run, if no events file is found for a subject, or if
        the number of events or confounds files differs from the number of
        images.
    """
    # check arguments
    img_filters = img_filters if img_filters else []
    if not isinstance(dataset_path, str):
        raise TypeError('dataset_path must be a string, instead %s was given' %
                        type(dataset_path))
    if not os.path.exists(dataset_path):
        raise ValueError('given path do not exist: %s' % dataset_path)
    if not isinstance(task_label, str):
        raise TypeError('task_label must be a string, instead %s was given' %
                        type(task_label))
    if not isinstance(space_label, str):
        raise TypeError('space_label must be a string, instead %s was given' %
                        type(space_label))
    if not isinstance(img_filters, list):
        raise TypeError('img_filters must be a list, instead %s was given' %
                        type(img_filters))
    for img_filter in img_filters:
        if (not isinstance(img_filter[0], str)
                or not isinstance(img_filter[1], str)):
            raise TypeError('filters in img filters must be (str, str), '
                            'instead %s was given' % type(img_filter))
        if img_filter[0] not in ['acq', 'rec', 'run', 'res', 'variant']:
            # Report the offending field itself, not its type.
            raise ValueError("field %s is not a possible filter. Only "
                             "'acq', 'rec', 'run', 'res' and 'variant' "
                             "are allowed." % img_filter[0])

    # check derivatives folder is present
    derivatives_path = os.path.join(dataset_path, derivatives_folder)
    if not os.path.exists(derivatives_path):
        raise ValueError('derivatives folder does not exist in given dataset')

    # Get acq specs for models. RepetitionTime and SliceTimingReference.
    # Throw warning if no bold.json is found
    if t_r is not None:
        # Both values are floats; %d would truncate them (e.g. 2.5 -> 2,
        # 0.5 -> 0) and make the warnings misleading.
        warn('RepetitionTime given in model_init as %2.2f' % t_r)
        warn('slice_time_ref is %2.2f percent of the repetition '
             'time' % slice_time_ref)
    else:
        filters = [('task', task_label)]
        for img_filter in img_filters:
            if img_filter[0] in ['acq', 'rec', 'run']:
                filters.append(img_filter)

        img_specs = get_bids_files(derivatives_path,
                                   modality_folder='func',
                                   file_tag='preproc',
                                   file_type='json',
                                   filters=filters)
        # If we dont find the parameter information in the derivatives folder
        # we try to search in the raw data folder
        if not img_specs:
            img_specs = get_bids_files(dataset_path,
                                       modality_folder='func',
                                       file_tag='bold',
                                       file_type='json',
                                       filters=filters)
        if not img_specs:
            warn('No preproc.json found in derivatives folder and no bold.json'
                 ' in dataset folder. t_r can not be inferred and will need to'
                 ' be set manually in the list of models, otherwise their fit '
                 'will throw an exception')
        else:
            # Only the first matching sidecar file is consulted. The context
            # manager guarantees the handle is closed after parsing.
            with open(img_specs[0], 'r') as spec_file:
                specs = json.load(spec_file)
            if 'RepetitionTime' in specs:
                t_r = float(specs['RepetitionTime'])
            else:
                warn('RepetitionTime not found in file %s. t_r can not be '
                     'inferred and will need to be set manually in the '
                     'list of models. Otherwise their fit will throw an '
                     ' exception' % img_specs[0])
            if 'SliceTimingRef' in specs:
                slice_time_ref = float(specs['SliceTimingRef'])
            else:
                warn('SliceTimingRef not found in file %s. It will be assumed'
                     ' that the slice timing reference is 0.0 percent of the '
                     'repetition time. If it is not the case it will need to '
                     'be set manually in the generated list of models' %
                     img_specs[0])

    # Infer subjects in dataset
    sub_folders = glob.glob(os.path.join(derivatives_path, 'sub-*/'))
    # s[:-1] drops the trailing separator so basename yields 'sub-<label>'
    sub_labels = [os.path.basename(s[:-1]).split('-')[1] for s in sub_folders]
    sub_labels = sorted(set(sub_labels))

    # Build fit_kwargs dictionaries to pass to their respective models fit
    # Events and confounds files must match number of imgs (runs)
    models = []
    models_run_imgs = []
    models_events = []
    models_confounds = []
    for sub_label in sub_labels:
        # Create model
        model = FirstLevelModel(t_r=t_r,
                                slice_time_ref=slice_time_ref,
                                hrf_model=hrf_model,
                                drift_model=drift_model,
                                period_cut=period_cut,
                                drift_order=drift_order,
                                fir_delays=fir_delays,
                                min_onset=min_onset,
                                mask=mask,
                                target_affine=target_affine,
                                target_shape=target_shape,
                                smoothing_fwhm=smoothing_fwhm,
                                memory=memory,
                                memory_level=memory_level,
                                standardize=standardize,
                                signal_scaling=signal_scaling,
                                noise_model=noise_model,
                                verbose=verbose,
                                n_jobs=n_jobs,
                                minimize_memory=minimize_memory,
                                subject_label=sub_label)
        models.append(model)

        # Get preprocessed imgs
        filters = [('task', task_label), ('space', space_label)] + img_filters
        imgs = get_bids_files(derivatives_path,
                              modality_folder='func',
                              file_tag='preproc',
                              file_type='nii*',
                              sub_label=sub_label,
                              filters=filters)
        # If there is more than one file for the same (ses, run), likely we
        # have an issue of underspecification of filters.
        run_check_list = []
        # If more than one run is present the run field is mandatory in BIDS
        # as well as the ses field if more than one session is present.
        if len(imgs) > 1:
            for img in imgs:
                img_dict = parse_bids_filename(img)
                if ('_ses-' in img_dict['file_basename']
                        and '_run-' in img_dict['file_basename']):
                    if (img_dict['ses'], img_dict['run']) in run_check_list:
                        raise ValueError(
                            'More than one nifti image found for the same run '
                            '%s and session %s. Please verify that the '
                            'preproc_variant and space_label labels '
                            'corresponding to the BIDS spec '
                            'were correctly specified.' %
                            (img_dict['run'], img_dict['ses']))
                    else:
                        run_check_list.append(
                            (img_dict['ses'], img_dict['run']))

                elif '_ses-' in img_dict['file_basename']:
                    if img_dict['ses'] in run_check_list:
                        raise ValueError(
                            'More than one nifti image found for the same ses '
                            '%s, while no additional run specification present'
                            '. Please verify that the preproc_variant and '
                            'space_label labels '
                            'corresponding to the BIDS spec '
                            'were correctly specified.' % img_dict['ses'])
                    else:
                        run_check_list.append(img_dict['ses'])

                elif '_run-' in img_dict['file_basename']:
                    if img_dict['run'] in run_check_list:
                        raise ValueError(
                            'More than one nifti image found for the same run '
                            '%s. Please verify that the preproc_variant and '
                            'space_label labels '
                            'corresponding to the BIDS spec '
                            'were correctly specified.' % img_dict['run'])
                    else:
                        run_check_list.append(img_dict['run'])
        models_run_imgs.append(imgs)

        # Get events and extra confounds
        filters = [('task', task_label)]
        for img_filter in img_filters:
            if img_filter[0] in ['acq', 'rec', 'run']:
                filters.append(img_filter)

        # Get events files
        events = get_bids_files(dataset_path,
                                modality_folder='func',
                                file_tag='events',
                                file_type='tsv',
                                sub_label=sub_label,
                                filters=filters)
        if events:
            if len(events) != len(imgs):
                raise ValueError('%d events.tsv files found for %d bold '
                                 'files. Same number of event files as '
                                 'the number of runs is expected' %
                                 (len(events), len(imgs)))
            events = [
                pd.read_csv(event, sep='\t', index_col=None)
                for event in events
            ]
            models_events.append(events)
        else:
            raise ValueError('No events.tsv files found')

        # Get confounds. If not found it will be assumed there are none.
        # If there are confounds, they are assumed to be present for all runs.
        confounds = get_bids_files(derivatives_path,
                                   modality_folder='func',
                                   file_tag='confounds',
                                   file_type='tsv',
                                   sub_label=sub_label,
                                   filters=filters)

        if confounds:
            if len(confounds) != len(imgs):
                # Report the number of confounds files (not events files).
                raise ValueError('%d confounds.tsv files found for %d bold '
                                 'files. Same number of confound files as '
                                 'the number of runs is expected' %
                                 (len(confounds), len(imgs)))
            confounds = [
                pd.read_csv(c, sep='\t', index_col=None) for c in confounds
            ]
            # NOTE(review): subjects without confounds files add no entry
            # here, so models_confounds can be shorter than models -- confirm
            # that callers expect this.
            models_confounds.append(confounds)

    return models, models_run_imgs, models_events, models_confounds
    print('RMSE %s: %.3f' % (estimator, score))

    if output_dir is not None:
        with open(join(debug_folder, 'score'), 'w+') as f:
            f.write('score : %.4f' % score)

    return score


# Per-run output directory under ~/output/dl_recommender/, named after the
# current local timestamp (YYYY-MM-DD_HH-MM-SS), and created immediately.
output_dir = expanduser(join('~/output/dl_recommender/',
                             datetime.datetime.now().strftime('%Y-%m-%d_%H'
                                                              '-%M-%S')))
os.makedirs(output_dir)

# Fixed seed so the permutation and the splits below are reproducible.
random_state = check_random_state(0)
# Cache the dataset fetch on disk under ~/cache.
mem = Memory(cachedir=expanduser("~/cache"), verbose=10)
X_csr = mem.cache(fetch_ml_10m)(expanduser('~/data/own/ml-10M100K'),
                               remove_empty=True)

# Shuffle the rows of the fetched matrix with the seeded generator.
permutation = random_state.permutation(X_csr.shape[0])

X_csr = X_csr[permutation]

# NOTE(review): presumably converts the matrix into (X, y) samples/targets
# for a factorization-machine style estimator -- confirm in
# array_to_fm_format.
X, y = array_to_fm_format(X_csr)

# Four random splits, each holding out 25% of the data for testing.
uniform_split = ShuffleSplit(n_iter=4,
                             test_size=.25, random_state=random_state)

# Decoder sized from the dimensions of the (shuffled) matrix.
fm_decoder = FMDecoder(n_samples=X_csr.shape[0], n_features=X_csr.shape[1])

base_estimator = BaseRecommender(fm_decoder)
Example #47
0
class FirstLevelModel(BaseEstimator, TransformerMixin, CacheMixin):
    """ Implementation of the General Linear Model for single session fMRI data

    Parameters
    ----------
    t_r : float
        This parameter indicates repetition times of the experimental runs.
        In seconds. It is necessary to correctly consider times in the design
        matrix. This parameter is also passed to nilearn.signal.clean.
        Please see the related documentation for details.

    slice_time_ref : float, optional (default 0.)
        This parameter indicates the time of the reference slice used in the
        slice timing preprocessing step of the experimental runs. It is
        expressed as a percentage of the t_r (time repetition), so it can have
        values between 0. and 1.

    hrf_model : {'spm', 'spm + derivative', 'spm + derivative + dispersion',
        'glover', 'glover + derivative', 'glover + derivative + dispersion',
        'fir', None}
        String that specifies the hemodynamic response function. Defaults to 'glover'.

    drift_model : string, optional
        This parameter specifies the desired drift model for the design
        matrices. It can be 'polynomial', 'cosine' or None.

    period_cut : float, optional
        This parameter specifies the cut period of the low-pass filter in
        seconds for the design matrices.

    drift_order : int, optional
        This parameter specifies the order of the drift model (in case it is
        polynomial) for the design matrices.

    fir_delays : array of shape(n_onsets) or list, optional
        In case of FIR design, yields the array of delays used in the FIR
        model, in seconds.

    min_onset : float, optional
        This parameter specifies the minimal onset relative to the design
        (in seconds). Events that start before (slice_time_ref * t_r +
        min_onset) are not considered.

    mask : Niimg-like, NiftiMasker object or False, optional
        Mask to be used on data. If an instance of masker is passed,
        then its mask will be used. If no mask is given,
        it will be computed automatically by a NiftiMasker with default
        parameters. If False is given then the data will not be masked.

    target_affine : 3x3 or 4x4 matrix, optional
        This parameter is passed to nilearn.image.resample_img. Please see the
        related documentation for details.

    target_shape : 3-tuple of integers, optional
        This parameter is passed to nilearn.image.resample_img. Please see the
        related documentation for details.

    smoothing_fwhm : float, optional
        If smoothing_fwhm is not None, it gives the size in millimeters of the
        spatial smoothing to apply to the signal.

    memory : string, optional
        Path to the directory used to cache the masking process and the glm
        fit. By default, no caching is done. Creates instance of joblib.Memory.

    memory_level : integer, optional
        Rough estimator of the amount of memory used by caching. Higher value
        means more memory for caching.

    standardize : boolean, optional
        If standardize is True, the time-series are centered and normed:
        their variance is put to 1 in the time dimension.

    signal_scaling : False, int or (int, int), optional,
        If not False, fMRI signals are scaled to the mean value of scaling_axis
        given, which can be 0, 1 or (0, 1). 0 refers to mean scaling each voxel
        with respect to time, 1 refers to mean scaling each time point with
        respect to all voxels and (0, 1) refers to scaling with respect to
        voxels and time, which is known as grand mean scaling.
        Incompatible with standardize (standardize=False is enforced when
        signal_scaling is not False).

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'

    verbose : integer, optional
        Indicate the level of verbosity. By default, nothing is printed.
        If 0 prints nothing. If 1 prints progress by computation of
        each run. If 2 prints timing details of masker and GLM. If 3
        prints masker computation details.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs', -2 'all CPUs but one', and so on.

    minimize_memory : boolean, optional
        Gets rid of some variables on the model fit results that are not
        necessary for contrast computation and would only be useful for
        further inspection of model details. This has an important impact
        on memory consumption. True by default.

    subject_label : string, optional
        This id will be used to identify a `FirstLevelModel` when passed to
        a `SecondLevelModel` object.

    Attributes
    ----------
    labels_ : list (one entry per run) of arrays of shape (n_voxels,),
        a map of values on voxels used to identify the corresponding model.
        None until `fit` has been called.

    results_ : list (one entry per run) of dicts,
        with keys corresponding to the different labels values
        values are RegressionResults instances corresponding to the voxels.
        None until `fit` has been called.

    design_matrices_ : list of pandas DataFrames,
        the design matrix used for each run. Set by `fit`.

    masker_ : NiftiMasker,
        the masker used to extract the voxel time series. Set by `fit`.

    """
    def __init__(self,
                 t_r=None,
                 slice_time_ref=0.,
                 hrf_model='glover',
                 drift_model='cosine',
                 period_cut=128,
                 drift_order=1,
                 fir_delays=[0],
                 min_onset=-24,
                 mask=None,
                 target_affine=None,
                 target_shape=None,
                 smoothing_fwhm=None,
                 memory=Memory(None),
                 memory_level=1,
                 standardize=False,
                 signal_scaling=0,
                 noise_model='ar1',
                 verbose=0,
                 n_jobs=1,
                 minimize_memory=True,
                 subject_label=None):
        # NOTE: the `fir_delays` and `memory` defaults are created once, at
        # definition time, and shared by every instance that relies on them.
        # They are only read (never mutated) by this class; keep it that way.

        # design matrix parameters
        self.t_r = t_r
        self.slice_time_ref = slice_time_ref
        self.hrf_model = hrf_model
        self.drift_model = drift_model
        self.period_cut = period_cut
        self.drift_order = drift_order
        self.fir_delays = fir_delays
        self.min_onset = min_onset
        # glm parameters
        self.mask = mask
        self.target_affine = target_affine
        self.target_shape = target_shape
        self.smoothing_fwhm = smoothing_fwhm
        # A string is interpreted as the path of the cache directory.
        if isinstance(memory, _basestring):
            self.memory = Memory(memory)
        else:
            self.memory = memory
        self.memory_level = memory_level
        self.standardize = standardize
        if signal_scaling is False:
            self.signal_scaling = signal_scaling
        elif signal_scaling in [0, 1, (0, 1)]:
            # Keep the requested axis aside and turn `signal_scaling` into a
            # flag; scaling and standardization are mutually exclusive.
            self.scaling_axis = signal_scaling
            self.signal_scaling = True
            self.standardize = False
        else:
            raise ValueError('signal_scaling must be "False", "0", "1"'
                             ' or "(0, 1)"')
        self.noise_model = noise_model
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.minimize_memory = minimize_memory
        # attributes
        self.labels_ = None
        self.results_ = None
        self.subject_label = subject_label

    def fit(self, run_imgs, events=None, confounds=None, design_matrices=None):
        """ Fit the GLM

        For each run:
        1. create design matrix X
        2. do a masker job: fMRI_data -> Y
        3. fit regression to (Y, X)

        Parameters
        ----------
        run_imgs: Niimg-like object or list of Niimg-like objects,
            See http://nilearn.github.io/manipulating_images/input_output.html#inputing-data-file-names-or-image-objects
            Data on which the GLM will be fitted. If this is a list,
            the affine is considered the same for all.

        events: pandas Dataframe or string or list of pandas DataFrames or
                   strings

            fMRI events used to build design matrices. One events object
            expected per run_img. Ignored in case designs is not None.
            If string, then a path to a csv file is expected.

        confounds: pandas Dataframe or string or list of pandas DataFrames or
                   strings

            Each column in a DataFrame corresponds to a confound variable
            to be included in the regression model of the respective run_img.
            The number of rows must match the number of volumes in the
            respective run_img. Ignored in case designs is not None.
            If string, then a path to a csv file is expected.

        design_matrices: pandas DataFrame or list of pandas DataFrames,
            Design matrices that will be used to fit the GLM. If given it
            takes precedence over events and confounds.

        Returns
        -------
        self : FirstLevelModel
            The fitted model, with `labels_`, `results_`, `design_matrices_`
            and `masker_` set.

        Raises
        ------
        ValueError
            If neither `events` nor `design_matrices` is given, if `t_r` is
            missing while the design has to be built from events, or if the
            number of rows of a confounds table does not match the number of
            scans of its run.

        """
        # Check arguments
        # Check imgs type
        if events is not None:
            _check_events_file_uses_tab_separators(events_files=events)
        if not isinstance(run_imgs, (list, tuple)):
            run_imgs = [run_imgs]
        if design_matrices is None:
            if events is None:
                raise ValueError('events or design matrices must be provided')
            if self.t_r is None:
                raise ValueError('t_r not given to FirstLevelModel object'
                                 ' to compute design from events')
        else:
            design_matrices = _check_run_tables(run_imgs, design_matrices,
                                                'design_matrices')
        # Check that number of events and confound files match number of runs
        # Also check that events and confound files can be loaded as DataFrame
        if events is not None:
            events = _check_run_tables(run_imgs, events, 'events')
        if confounds is not None:
            confounds = _check_run_tables(run_imgs, confounds, 'confounds')

        # Learn the mask
        if self.mask is False:
            # We create a dummy mask to preserve functionality of api
            ref_img = check_niimg(run_imgs[0])
            self.mask = Nifti1Image(np.ones(ref_img.shape[:3]), ref_img.affine)
        if not isinstance(self.mask, NiftiMasker):
            self.masker_ = NiftiMasker(mask_img=self.mask,
                                       smoothing_fwhm=self.smoothing_fwhm,
                                       target_affine=self.target_affine,
                                       standardize=self.standardize,
                                       mask_strategy='epi',
                                       t_r=self.t_r,
                                       memory=self.memory,
                                       verbose=max(0, self.verbose - 2),
                                       target_shape=self.target_shape,
                                       memory_level=self.memory_level)
            self.masker_.fit(run_imgs[0])
        else:
            # `masker_` does not exist before the first fit, and `mask_img_`
            # may be missing on a masker that was never fitted: read both
            # defensively instead of raising AttributeError.
            fitted_mask_img = getattr(self.mask, 'mask_img_', None)
            previous_masker = getattr(self, 'masker_', None)
            if fitted_mask_img is None and previous_masker is None:
                # Work on a copy so the user's masker is left untouched, and
                # let our own non-None parameters take precedence.
                self.masker_ = clone(self.mask)
                for param_name in [
                        'target_affine', 'target_shape', 'smoothing_fwhm',
                        't_r', 'memory', 'memory_level'
                ]:
                    our_param = getattr(self, param_name)
                    if our_param is None:
                        continue
                    if getattr(self.masker_, param_name) is not None:
                        warn('Parameter %s of the masker'
                             ' overriden' % param_name)
                    setattr(self.masker_, param_name, our_param)
                self.masker_.fit(run_imgs[0])
            else:
                self.masker_ = self.mask

        # For each run fit the model and keep only the regression results.
        self.labels_, self.results_, self.design_matrices_ = [], [], []
        n_runs = len(run_imgs)
        t0 = time.time()
        for run_idx, run_img in enumerate(run_imgs):
            # Report progress
            if self.verbose > 0:
                percent = float(run_idx) / n_runs
                percent = round(percent * 100, 2)
                dt = time.time() - t0
                # We use a max to avoid a division by zero
                if run_idx == 0:
                    remaining = 'go take a coffee, a big one'
                else:
                    remaining = (100. - percent) / max(0.01, percent) * dt
                    remaining = '%i seconds remaining' % remaining

                sys.stderr.write("Computing run %d out of %d runs (%s)\n" %
                                 (run_idx + 1, n_runs, remaining))

            # Build the experimental design for the glm
            run_img = check_niimg(run_img, ensure_ndim=4)
            if design_matrices is None:
                # The image shape is enough here; there is no need to load
                # the whole 4D array just to count the scans.
                n_scans = run_img.shape[3]
                if confounds is not None:
                    confounds_matrix = confounds[run_idx].values
                    if confounds_matrix.shape[0] != n_scans:
                        raise ValueError('Rows in confounds does not match '
                                         'n_scans in run_img at index %d' %
                                         (run_idx, ))
                    confounds_names = confounds[run_idx].columns.tolist()
                else:
                    confounds_matrix = None
                    confounds_names = None
                # Acquisition time of every scan, shifted by the reference
                # slice offset.
                start_time = self.slice_time_ref * self.t_r
                end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r
                frame_times = np.linspace(start_time, end_time, n_scans)
                design = make_first_level_design_matrix(
                    frame_times, events[run_idx], self.hrf_model,
                    self.drift_model, self.period_cut, self.drift_order,
                    self.fir_delays, confounds_matrix, confounds_names,
                    self.min_onset)
            else:
                design = design_matrices[run_idx]
            self.design_matrices_.append(design)

            # Mask and prepare data for GLM
            if self.verbose > 1:
                t_masking = time.time()
                sys.stderr.write('Starting masker computation \r')

            Y = self.masker_.transform(run_img)

            if self.verbose > 1:
                t_masking = time.time() - t_masking
                sys.stderr.write('Masker took %d seconds       \n' % t_masking)

            if self.signal_scaling:
                Y, _ = mean_scaling(Y, self.scaling_axis)
            if self.memory:
                # `n_jobs` does not change the result, so it must not
                # invalidate the cache.
                mem_glm = self.memory.cache(run_glm, ignore=['n_jobs'])
            else:
                mem_glm = run_glm

            # compute GLM
            if self.verbose > 1:
                t_glm = time.time()
                sys.stderr.write('Performing GLM computation\r')
            labels, results = mem_glm(Y,
                                      design.values,
                                      noise_model=self.noise_model,
                                      bins=100,
                                      n_jobs=self.n_jobs)
            if self.verbose > 1:
                t_glm = time.time() - t_glm
                sys.stderr.write('GLM took %d seconds         \n' % t_glm)

            self.labels_.append(labels)
            # We save memory if inspecting model details is not necessary
            if self.minimize_memory:
                for key in results:
                    results[key] = SimpleRegressionResults(results[key])
            self.results_.append(results)
            del Y

        # Report progress
        if self.verbose > 0:
            sys.stderr.write(
                "\nComputation of %d runs done in %i seconds\n\n" %
                (n_runs, time.time() - t0))

        return self

    def compute_contrast(self,
                         contrast_def,
                         stat_type=None,
                         output_type='z_score'):
        r"""Generate different outputs corresponding to
        the contrasts provided e.g. z_map, t_map, effects and variance.
        In multi-session case, outputs the fixed effects map.

        Parameters
        ----------
        contrast_def : str or array of shape (n_col) or list of (string or
                       array of shape (n_col))

            where ``n_col`` is the number of columns of the design matrix,
            (one array per run). If only one array is provided when there
            are several runs, it will be assumed that the same contrast is
            desired for all runs. The string can be a formula compatible with
            the linear constraint of the Patsy library. Basically one can use
            the name of the conditions as they appear in the design matrix of
            the fitted model combined with operators /\*+- and numbers.
            Please checks the patsy documentation for formula examples:
            http://patsy.readthedocs.io/en/latest/API-reference.html#patsy.DesignInfo.linear_constraint

        stat_type : {'t', 'F'}, optional
            type of the contrast

        output_type : str, optional
            Type of the output map. Can be 'z_score', 'stat', 'p_value',
            'effect_size', 'effect_variance' or 'all'

        Returns
        -------
        output : Nifti1Image or dict
            The desired output image(s). If ``output_type == 'all'``, then
            the output is a dictionary of images, keyed by the type of image.

        Raises
        ------
        ValueError
            If the model has not been fit, if `contrast_def` has an
            unsupported type, if the number of contrasts is neither one nor
            the number of runs, or if `output_type` is not a valid type.

        """
        if self.labels_ is None or self.results_ is None:
            raise ValueError('The model has not been fit yet')

        if isinstance(contrast_def, (np.ndarray, str)):
            con_vals = [contrast_def]
        elif isinstance(contrast_def, (list, tuple)):
            con_vals = contrast_def
        else:
            raise ValueError('contrast_def must be an array or str or list of'
                             ' (array or str)')

        # Translate formulas to vectors with patsy. A new list is built so
        # that the caller's sequence is never modified in place and tuples
        # (which do not support item assignment) are handled as well.
        design_info = DesignInfo(self.design_matrices_[0].columns.tolist())
        con_vals = [
            con if isinstance(con, np.ndarray)
            else design_info.linear_constraint(con).coefs
            for con in con_vals
        ]

        # Only a single contrast can be broadcast to every run; any other
        # mismatch is ambiguous and must not be silently replicated.
        n_runs = len(self.labels_)
        n_contrasts = len(con_vals)
        if n_contrasts == 1 and n_runs > 1:
            warn('One contrast given, assuming it for all %d runs' % n_runs)
            con_vals = con_vals * n_runs
        elif n_contrasts != n_runs:
            raise ValueError('%d contrasts given, while there are %d runs' %
                             (n_contrasts, n_runs))

        # 'all' is assumed to be the final entry; if adding more, place before 'all'
        valid_types = [
            'z_score', 'stat', 'p_value', 'effect_size', 'effect_variance',
            'all'
        ]
        if output_type not in valid_types:
            raise ValueError(
                'output_type must be one of {}'.format(valid_types))

        contrast = _fixed_effect_contrast(self.labels_, self.results_,
                                          con_vals, stat_type)

        output_types = valid_types[:-1] if output_type == 'all' else [
            output_type
        ]

        # The same description of the contrast is stored in every header.
        contrast_name = str(con_vals)
        outputs = {}
        for output_type_ in output_types:
            # Each output type is the name of a method of the contrast.
            estimate_ = getattr(contrast, output_type_)()
            # Prepare the returned images
            output = self.masker_.inverse_transform(estimate_)
            output.header['descrip'] = ('%s of contrast %s' %
                                        (output_type_, contrast_name))
            outputs[output_type_] = output

        return outputs if output_type == 'all' else output
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
                          gamma=5, metric='minkowski', p=2, algorithm='best',
                          memory=Memory(cachedir=None, verbose=0)):
    r"""Perform robust single linkage clustering from a vector array
    or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix
        A feature array of shape (n_samples, n_features), or an array of
        distances between samples of shape (n_samples, n_samples) if
        ``metric='precomputed'``.

    cut : float
        The reachability distance value to cut the cluster hierarchy at
        to derive a flat cluster labelling.

    k : int, optional
        Reachability distances will be computed with regard to the `k`
        nearest neighbors. (default 5)

    alpha : float, optional
        Distance scaling for reachability distance computation. Reachability
        distance is computed as $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
        Any real number >= 1.0 is accepted and converted to float.
        (default sqrt(2))

    gamma : int, optional
        Ignore any clusters in the flat clustering with size less than gamma,
        and declare points in such clusters as noise points. (default 5)

    metric : string, or callable, optional
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional
        Forwarded unchanged, together with ``metric``, to the routine that
        builds the single linkage tree (presumably the power of the
        Minkowski metric -- confirm against those routines). (default 2)

    algorithm : string, optional
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``generic``
            * ``best``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : Instance of joblib.Memory or string (optional)
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    Returns
    -------
    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    single_linkage_tree : array [n_samples - 1, 4]
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    Raises
    ------
    ValueError
        If ``k`` or ``gamma`` is not an integer >= 1, or ``alpha`` is not a
        real number >= 1.0.

    TypeError
        If ``algorithm`` is not one of the supported names.

    References
    ----------
    K. Chaudhuri and S. Dasgupta.
    "Rates of convergence for the cluster tree."
    In Advances in Neural Information Processing Systems, 2010.

    """
    import numbers

    def _is_number(value, kind):
        # bool is a subclass of int; it was never a valid value here.
        return isinstance(value, kind) and not isinstance(value, bool)

    # Accept numpy scalars (and an integer alpha) as well as builtin numbers.
    if not _is_number(k, numbers.Integral) or k < 1:
        raise ValueError('k must be an integer greater than zero!')

    if not _is_number(alpha, numbers.Real) or alpha < 1.0:
        raise ValueError('alpha must be a float greater than or equal to 1.0!')

    if not _is_number(gamma, numbers.Integral) or gamma < 1:
        raise ValueError('gamma must be an integer greater than zero!')

    # Hand plain Python numbers to the tree-building routines.
    k, alpha, gamma = int(k), float(alpha), int(gamma)

    X = check_array(X, accept_sparse='csr')
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    if algorithm != 'best':
        explicit_builders = {
            'generic': _rsl_generic,
            'prims_kdtree': _rsl_prims_kdtree,
            'prims_balltree': _rsl_prims_balltree,
            'boruvka_kdtree': _rsl_boruvka_kdtree,
            'boruvka_balltree': _rsl_boruvka_balltree,
        }
        build_tree = explicit_builders.get(algorithm)
        if build_tree is None:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    elif issparse(X) or metric not in FAST_METRICS:
        # We can't do much with sparse matrices ...
        build_tree = _rsl_generic
    elif metric in KDTree.valid_metrics:
        # Need heuristic to decide when to go to boruvka; still debugging
        # for now
        if X.shape[1] > 128:
            build_tree = _rsl_prims_kdtree
        else:
            build_tree = _rsl_boruvka_kdtree
    else:  # Metric is a valid BallTree metric
        # Need heuristic to decide when to go to boruvka; still debugging
        # for now. The metric is not a KD-tree metric on this branch, so the
        # Prim's variant has to be the ball tree one as well.
        if X.shape[1] > 128:
            build_tree = _rsl_prims_balltree
        else:
            build_tree = _rsl_boruvka_balltree

    single_linkage_tree = memory.cache(build_tree)(X, k, alpha, metric, p)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
Example #49
0
# Each row of X is viewed as a (size, size) image, Gaussian-smoothed, and
# written back in place (x[:] assigns into the row of X itself).
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
# Standardize every column in place: zero mean, unit standard deviation.
X -= X.mean(axis=0)
X /= X.std(axis=0)

# Noise-free target: linear combination of the features by the true coefs.
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
# Rescale the noise so that norm(y) / norm(noise_coef * noise) equals
# exp(snr / 20).
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
# Fresh temporary directory used as joblib cache; nothing in this fragment
# removes it afterwards.
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
# Take the coefficients of the last pipeline step (the ridge), then map them
# back through the first step (the agglomeration) and reshape them to a
# (size, size) image.
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
class FirstLevelModel(BaseEstimator, TransformerMixin, CacheMixin):
    """ Implementation of the General Linear Model for single session fMRI data

    Parameters
    ----------

    t_r: float
        This parameter indicates repetition times of the experimental runs.
        In seconds. It is necessary to correctly consider times in the design
        matrix. This parameter is also passed to nilearn.signal.clean.
        Please see the related documentation for details.

    slice_time_ref: float, optional (default 0.)
        This parameter indicates the time of the reference slice used in the
        slice timing preprocessing step of the experimental runs. It is
        expressed as a percentage of the t_r (time repetition), so it can have
        values between 0. and 1.

    hrf_model : string, optional
        This parameter specifies the hemodynamic response function (HRF) for
        the design matrices. It can be 'canonical', 'canonical with derivative'
        or 'fir'. Defaults to 'glover'; the value is forwarded unchanged to
        make_design_matrix.

    drift_model : string, optional
        This parameter specifies the desired drift model for the design
        matrices. It can be 'polynomial', 'cosine' or 'blank'.
        Defaults to 'cosine'.

    period_cut : float, optional
        This parameter specifies the cut period of the low-pass filter in
        seconds for the design matrices. Defaults to 128.

    drift_order : int, optional
        This parameter specifies the order of the drift model (in case it is
        polynomial) for the design matrices. Defaults to 1.

    fir_delays : array of shape(n_onsets) or list, optional
        In case of FIR design, yields the array of delays used in the FIR
        model, in seconds. Defaults to [0].

    min_onset : float, optional
        This parameter specifies the minimal onset relative to the design
        (in seconds). Events that start before (slice_time_ref * t_r +
        min_onset) are not considered. Defaults to -24.

    mask: Niimg-like, NiftiMasker or MultiNiftiMasker object, optional,
        Mask to be used on data. If an instance of masker is passed,
        then its mask will be used. If no mask is given,
        it will be computed automatically by a MultiNiftiMasker with default
        parameters.

    target_affine: 3x3 or 4x4 matrix, optional
        This parameter is passed to nilearn.image.resample_img. Please see the
        related documentation for details.

    target_shape: 3-tuple of integers, optional
        This parameter is passed to nilearn.image.resample_img. Please see the
        related documentation for details.

    smoothing_fwhm: float, optional
        If smoothing_fwhm is not None, it gives the size in millimeters of the
        spatial smoothing to apply to the signal.

    memory: string or joblib.Memory, optional
        Path to the directory used to cache the masking process and the glm
        fit. By default, no caching is done. A string is converted to an
        instance of joblib.Memory; any other value is stored as given.

    memory_level: integer, optional
        Rough estimator of the amount of memory used by caching. Higher value
        means more memory for caching.

    standardize : boolean, optional
        If standardize is True, the time-series are centered and normed:
        their variance is put to 1 in the time dimension.

    signal_scaling: False, int or (int, int), optional,
        If not False, fMRI signals are scaled to the mean value of scaling_axis
        given, which can be 0, 1 or (0, 1). 0 refers to mean scaling each voxel
        with respect to time, 1 refers to mean scaling each time point with
        respect to all voxels and (0, 1) refers to scaling with respect to
        voxels and time, which is known as grand mean scaling.
        Incompatible with standardize (standardize=False is enforced when
        signal_scaling is not False). Defaults to 0.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'

    verbose : integer, optional
        Indicate the level of verbosity. Defaults to 1; progress is written
        to stderr when verbose > 0.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs', -2 'all CPUs but one', and so on.

    minimize_memory : boolean, optional
        Gets rid of some variables on the model fit results that are not
        necessary for contrast computation and would only be useful for
        further inspection of model details. This has an important impact
        on memory consumption. True by default.

    Attributes
    ----------
    labels_ : list with one entry per run (None before fit),
        each entry is a map of values on voxels used to identify the
        corresponding model

    results_ : list of dicts, one per run (None before fit),
        with keys corresponding to the different labels values
        values are RegressionResults instances corresponding to the voxels
        (SimpleRegressionResults when minimize_memory is True)

    design_matrices_ : list, one entry per run, set by fit
        the design matrix used for each run

    masker_ : NiftiMasker, set by fit
        the masker used to turn the run images into 2D signals and back
    """
    def __init__(self, t_r=None, slice_time_ref=0., hrf_model='glover',
                 drift_model='cosine', period_cut=128, drift_order=1,
                 fir_delays=[0], min_onset=-24, mask=None, target_affine=None,
                 target_shape=None, smoothing_fwhm=None, memory=Memory(None),
                 memory_level=1, standardize=False, signal_scaling=0,
                 noise_model='ar1', verbose=1, n_jobs=1,
                 minimize_memory=True):
        # design matrix parameters
        self.t_r = t_r
        self.slice_time_ref = slice_time_ref
        self.hrf_model = hrf_model
        self.drift_model = drift_model
        self.period_cut = period_cut
        self.drift_order = drift_order
        # NOTE: the default list is shared between instances (kept for
        # signature compatibility); it is only read, never mutated, here.
        self.fir_delays = fir_delays
        self.min_onset = min_onset
        # glm parameters
        self.mask = mask
        self.target_affine = target_affine
        self.target_shape = target_shape
        self.smoothing_fwhm = smoothing_fwhm
        if isinstance(memory, _basestring):
            self.memory = Memory(memory)
        else:
            self.memory = memory
        self.memory_level = memory_level
        self.standardize = standardize
        # ``False == 0`` in Python, so the identity test has to come before
        # the membership test; otherwise ``signal_scaling=False`` would be
        # accepted as axis 0 and scaling could never be switched off.
        if signal_scaling is False:
            self.signal_scaling = signal_scaling
        elif signal_scaling in [0, 1, (0, 1)]:
            self.scaling_axis = signal_scaling
            self.signal_scaling = True
            self.standardize = False
        else:
            raise ValueError('signal_scaling must be "False", "0", "1"'
                             ' or "(0, 1)"')
        self.noise_model = noise_model
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.minimize_memory = minimize_memory
        # attributes
        self.labels_ = None
        self.results_ = None

    def fit(self, run_imgs, paradigms=None, confounds=None,
            design_matrices=None):
        """ Fit the GLM

        For each run:
        1. create design matrix X
        2. do a masker job: fMRI_data -> Y
        3. fit regression to (Y, X)

        Parameters
        ----------
        run_imgs: Niimg-like object or list of Niimg-like objects,
            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
            Data on which the GLM will be fitted. If this is a list,
            the affine is considered the same for all.

        paradigms: pandas Dataframe or string or list of pandas DataFrames or
                   strings,
            fMRI paradigms used to build design matrices. One paradigm expected
            per run_img. Ignored in case design_matrices is not None.

        confounds: pandas Dataframe or string or list of pandas DataFrames or
                   strings,
            Each column in a DataFrame corresponds to a confound variable
            to be included in the regression model of the respective run_img.
            The number of rows must match the number of volumes in the
            respective run_img. Ignored in case design_matrices is not None.

        design_matrices: pandas DataFrame or list of pandas DataFrames,
            Design matrices that will be used to fit the GLM.

        Returns
        -------
        self : FirstLevelModel
            The fitted model, with labels_, results_, design_matrices_ and
            masker_ set.

        Raises
        ------
        ValueError
            If run_imgs contains something that is neither a string nor a
            Nifti1Image, if neither paradigms nor design_matrices is given,
            if t_r is missing while a design has to be built, or if the
            number of confound rows differs from the number of scans.
        """
        # Check arguments
        # Check imgs type
        if not isinstance(run_imgs, (list, tuple)):
            run_imgs = [run_imgs]
        for rimg in run_imgs:
            if not isinstance(rimg, (_basestring, Nifti1Image)):
                raise ValueError('run_imgs must be Niimg-like object or list'
                                 ' of Niimg-like objects')
        # check all information necessary to build design matrices is available
        if design_matrices is None:
            if paradigms is None:
                raise ValueError('paradigms or design matrices must be provided')
            if self.t_r is None:
                raise ValueError('t_r not given to FirstLevelModel object'
                                 ' to compute design from paradigm')
        else:
            design_matrices = _check_run_tables(run_imgs, design_matrices,
                                                'design_matrices')
        # check the number of paradigm and confound files match number of runs
        # Also check paradigm and confound files can be loaded as DataFrame
        if paradigms is not None:
            paradigms = _check_run_tables(run_imgs, paradigms, 'paradigms')

        if confounds is not None:
            confounds = _check_run_tables(run_imgs, confounds, 'confounds')

        # Learn the mask
        if not isinstance(self.mask, NiftiMasker):
            self.masker_ = NiftiMasker(
                mask_img=self.mask, smoothing_fwhm=self.smoothing_fwhm,
                target_affine=self.target_affine,
                standardize=self.standardize, mask_strategy='epi',
                t_r=self.t_r, memory=self.memory,
                verbose=max(0, self.verbose - 1),
                target_shape=self.target_shape,
                memory_level=self.memory_level)
        else:
            self.masker_ = clone(self.mask)
            # Only parameters that this model actually owns can override the
            # masker's.  'low_pass' and 'high_pass' are not attributes of
            # FirstLevelModel, so looking them up here raised AttributeError.
            for param_name in ['target_affine', 'target_shape',
                               'smoothing_fwhm', 't_r', 'memory',
                               'memory_level']:
                our_param = getattr(self, param_name)
                if our_param is None:
                    continue
                if getattr(self.masker_, param_name) is not None:
                    warn('Parameter %s of the masker overriden' % param_name)
                setattr(self.masker_, param_name, our_param)
        # The mask is learnt on the first run only.
        self.masker_.fit(run_imgs[0])

        # For each run fit the model and keep only the regression results.
        self.labels_, self.results_, self.design_matrices_ = [], [], []
        n_runs = len(run_imgs)
        t0 = time.time()
        for run_idx, run_img in enumerate(run_imgs):
            # Report progress
            if self.verbose > 0:
                percent = float(run_idx) / n_runs
                percent = round(percent * 100, 2)
                dt = time.time() - t0
                # We use a max to avoid a division by zero
                if run_idx == 0:
                    remaining = 'go take a coffee, a big one'
                else:
                    remaining = (100. - percent) / max(0.01, percent) * dt
                    remaining = '%i seconds remaining' % remaining
                # Blank the line before rewriting it in place.
                sys.stderr.write(" " * 100 + "\r")
                # Runs are reported 1-based ("run 1 out of N").
                sys.stderr.write(
                    "Computing run %d out of %d runs (%s)\r"
                    % (run_idx + 1, n_runs, remaining))

            # Build the experimental design for the glm
            run_img = check_niimg(run_img, ensure_ndim=4)
            if design_matrices is None:
                n_scans = run_img.get_data().shape[3]
                if confounds is not None:
                    confounds_matrix = confounds[run_idx].values
                    if confounds_matrix.shape[0] != n_scans:
                        raise ValueError('Rows in confounds does not match '
                                         'n_scans in run_img at index %d'
                                         % (run_idx,))
                    confounds_names = confounds[run_idx].columns
                else:
                    confounds_matrix = None
                    confounds_names = None
                # One frame time per scan, shifted by the reference slice.
                start_time = self.slice_time_ref * self.t_r
                end_time = (n_scans - 1 + self.slice_time_ref) * self.t_r
                frame_times = np.linspace(start_time, end_time, n_scans)
                design = make_design_matrix(frame_times, paradigms[run_idx],
                                            self.hrf_model, self.drift_model,
                                            self.period_cut, self.drift_order,
                                            self.fir_delays, confounds_matrix,
                                            confounds_names, self.min_onset)
            else:
                design = design_matrices[run_idx]
            self.design_matrices_.append(design)

            # Compute GLM
            Y = self.masker_.transform(run_img)
            if self.signal_scaling:
                Y, _ = mean_scaling(Y, self.scaling_axis)
            if self.memory is not None:
                mem_glm = self.memory.cache(run_glm)
            else:
                mem_glm = run_glm
            labels, results = mem_glm(Y, design,
                                      noise_model=self.noise_model,
                                      bins=100, n_jobs=self.n_jobs)
            self.labels_.append(labels)
            # We save memory if inspecting model details is not necessary
            if self.minimize_memory:
                for key in results:
                    results[key] = SimpleRegressionResults(results[key])
            self.results_.append(results)
            # Release the masked data before loading the next run.
            del Y

        # Report progress
        if self.verbose > 0:
            sys.stderr.write("\nComputation of %d runs done in %i seconds\n"
                             % (n_runs, time.time() - t0))

        return self

    def compute_contrast(self, contrast_def, contrast_name=None,
                         stat_type=None, output_type='z_score'):
        """Generate different outputs corresponding to
        the contrasts provided e.g. z_map, t_map, effects and variance.
        In multi-session case, outputs the fixed effects map.

        Parameters
        ----------
        contrast_def : array or list of arrays of shape (n_col) or (n_run, n_col)
            where ``n_col`` is the number of columns of the design matrix,
            (one array per run). If only one array is provided when there
            are several runs, it will be assumed that the same contrast is
            desired for all runs

        contrast_name : str, optional
            name of the contrast. Defaults to the string representation of
            the per-run contrast list.

        stat_type : {'t', 'F'}, optional
            type of the contrast

        output_type : str, optional
            Type of the output map. Can be 'z_score', 'stat', 'p_value',
            'effect_size' or 'effect_variance'

        Returns
        -------
        output_image : Nifti1Image
            The desired output image

        Raises
        ------
        ValueError
            If the model has not been fit, if contrast_def is not an array or
            a list/tuple of arrays, if the number of contrasts is neither 1
            nor the number of runs, or if output_type is not supported.

        """
        if self.labels_ is None or self.results_ is None:
            raise ValueError('The model has not been fit yet')

        if isinstance(contrast_def, np.ndarray):
            con_vals = [contrast_def]
        elif isinstance(contrast_def, (list, tuple)):
            con_vals = contrast_def
            for cidx, con in enumerate(contrast_def):
                if not isinstance(con, np.ndarray):
                    raise ValueError('contrast_def at index %i is not an'
                                     ' array' % cidx)
        else:
            raise ValueError('contrast_def must be an array or list of arrays')
        n_runs = len(self.labels_)
        if len(con_vals) != n_runs:
            # Only a single contrast can be broadcast to every run; any other
            # mismatch used to be replicated blindly and silently misaligned.
            if len(con_vals) != 1:
                raise ValueError('%d contrasts given for %d runs: provide one'
                                 ' contrast or one contrast per run'
                                 % (len(con_vals), n_runs))
            warn('One contrast given, assuming it for all %d runs' % n_runs)
            con_vals = con_vals * n_runs
        if isinstance(output_type, _basestring):
            if output_type not in ['z_score', 'stat', 'p_value', 'effect_size',
                                   'effect_variance']:
                raise ValueError('output_type must be one of "z_score", "stat",'
                                 ' "p_value","effect_size" or "effect_variance"')
        else:
            raise ValueError('output_type must be one of "z_score", "stat",'
                             ' "p_value","effect_size" or "effect_variance"')

        # Not cached on purpose: the previous cache key ignored ``labels`` and
        # ``results``, so a shared cache could return the contrast of another
        # fit for the same contrast values.
        contrast = _fixed_effect_contrast(self.labels_, self.results_,
                                          con_vals, stat_type)

        # output_type doubles as the name of the accessor on the contrast.
        estimate_ = getattr(contrast, output_type)()
        # Prepare the returned images
        output = self.masker_.inverse_transform(estimate_)
        if contrast_name is None:
            contrast_name = str(con_vals)
        output.get_header()['descrip'] = (
            '%s of contrast %s' % (output_type, contrast_name))
        return output
Example #51
0
# Author: Nelle Varoquaux <*****@*****.**>
# License: BSD

import numpy as np
import scipy as sp

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster.mean_shift_ import MeanShift, estimate_bandwidth
from sklearn.externals.joblib import Memory

from skimage.data import camera

# Module-level joblib Memory whose cache directory is the current directory.
mem = Memory(cachedir='.')


def calculate_cluster(camera, camera_mat, quantile):
    bandwidth = estimate_bandwidth(camera_mat,
                                   quantile=quantile,
                                   n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(camera_mat)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    camera_clustered = camera.copy()