Example #1
    def test_sensitivity_based_feature_selection(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())

        # number of features to remove
        Nremove = 2

        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        fe = SensitivityBasedFeatureSelection(
            sens_ana,
            feature_selector=FixedNElementTailSelector(Nremove),
            enable_ca=["sensitivity", "selected_ids"])

        data = self.get_data()

        data_nfeatures = data.nfeatures

        fe.train(data)
        resds = fe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # silly check that exactly Nremove features were removed
        self.assertEqual(data.nfeatures,
                         resds.nfeatures + Nremove,
                         msg="We had to remove exactly %d features" % Nremove)

        self.assertEqual(
            fe.ca.sensitivity.nfeatures,
            data_nfeatures,
            msg="Sensitivity have to have # of features equal to original")
Example #2
    def __init__(self,
                 fmeasure,
                 pmeasure,
                 splitter,
                 fselector=FixedNElementTailSelector(1,
                                                     tail='upper',
                                                     mode='select'),
                 **kwargs):
        """Initialize incremental feature search

        Parameters
        ----------
        fmeasure : Measure
          Computed for each candidate feature selection. The measure has
          to compute a scalar value.
        pmeasure : Measure
          Computed against a test dataset for each incremental feature
          set.
        splitter : Splitter
          This splitter instance has to generate at least two dataset splits
          when called with the input dataset. The first split serves as the
          training dataset and the second as the evaluation dataset.
        """
        # base class init first
        IterativeFeatureSelection.__init__(self, fmeasure, pmeasure, splitter,
                                           fselector, **kwargs)
Example #3
    def test_feature_selection_classifier_with_regression(self):
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression-based classifier was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        # now test with a regression-based classifier. The problem is
        # that predictions get determined from values twice, and the
        # values are then set from those results, so the second time
        # around the values are set to the predictions.  The final
        # outcome is that the values are actually predictions...
        dat = dataset_wizard(samples=np.random.randn(4, 10),
                             targets=[-1, -1, 1, 1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        _ = clf_reg.predict(dat.samples)
        self.assertFalse(
            (np.array(clf_reg.ca.estimates) -
             clf_reg.ca.predictions).sum() == 0,
            msg="Values were set to the predictions in %s." % sample_clf_reg)
Example #4
    def test_feature_selection_pipeline(self):
        sens_ana = SillySensitivityAnalyzer()

        data = self.get_data()
        data_nfeatures = data.nfeatures

        # test silly one first ;-)
        self.assertEqual(
            sens_ana(data).samples[0, 0], -int(data_nfeatures / 2))

        # OLD: first remove 25% == 6, and then 4, total removing 10
        # NOW: test should be independent of the numerical number of features
        feature_selections = [
            SensitivityBasedFeatureSelection(sens_ana,
                                             FractionTailSelector(0.25)),
            SensitivityBasedFeatureSelection(sens_ana,
                                             FixedNElementTailSelector(4))
        ]

        # create a FeatureSelection pipeline
        feat_sel_pipeline = ChainMapper(feature_selections)

        feat_sel_pipeline.train(data)
        resds = feat_sel_pipeline(data)

        self.assertEqual(len(feat_sel_pipeline),
                         len(feature_selections),
                         msg="Test the property feature_selections")

        desired_nfeatures = int(np.ceil(data_nfeatures * 0.75))
        self.assertEqual([fe._oshape[0] for fe in feat_sel_pipeline],
                         [desired_nfeatures, desired_nfeatures - 4])
Example #5
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assessment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm,
                                   NFoldPartitioner(),
                                   postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  # go for lower tail selection as data_measure will return
                  # errors -> low is good
                  fselector=FixedNElementTailSelector(1, tail='lower',
                                                      mode='select'),
                  )
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertTrue(ds.nfeatures == orig_nfeatures)

        # check that the feature set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertTrue(resds.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
Example #6
    def __test_fspipeline_with_split_classifier(self, basic_clf):
        #basic_clf = LinearNuSVMC()
        multi_clf = MulticlassClassifier(clf=basic_clf)
        #svm_weigths = LinearSVMWeights(svm)

        # Proper RFE: aggregate sensitivities across multiple splits,
        # but also due to multi class those need to be aggregated
        # somehow. Transfer error here should be 'leave-1-out' error
        # of split classifier itself
        sclf = SplitClassifier(clf=basic_clf)
        rfe = RFE(sensitivity_analyzer=sclf.get_sensitivity_analyzer(
            enable_ca=["sensitivities"]),
                  transfer_error=trans_error,
                  feature_selector=FeatureSelectionPipeline([
                      FractionTailSelector(0.5),
                      FixedNElementTailSelector(1)
                  ]),
                  train_pmeasure=True)

        # and we get sensitivity analyzer which works on splits and uses
        # sensitivity
        selected_features = rfe(self.dataset)
Example #7
    def test_analyzer_with_split_classifier(self, clfds):
        """Test analyzers in split classifier
        """
        clf, ds = clfds  # unroll the tuple
        # We need to skip some LARSes here
        _sclf = str(clf)
        if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
            # ADD KnownToFail thingie from NiPy
            return

        # To avoid wasting too much time testing, let's limit to 3 splits
        nsplits = 3
        partitioner = NFoldPartitioner(count=nsplits)
        mclf = SplitClassifier(clf=clf,
                               partitioner=partitioner,
                               enable_ca=['training_stats', 'stats'])
        sana = mclf.get_sensitivity_analyzer(  # postproc=absolute_features(),
            pass_attr=['fa.nonbogus_targets'],
            enable_ca=["sensitivities"])

        ulabels = ds.uniquetargets
        nlabels = len(ulabels)
        # Can't rely on splitcfg since count-limit is done in __call__
        assert (nsplits == len(list(partitioner.generate(ds))))
        sens = sana(ds)
        assert ('nonbogus_targets' in sens.fa)  # were they passed?
        # TODO: those few do not expose biases
        if not len(set(clf.__tags__).intersection(('lars', 'glmnet', 'gpr'))):
            assert ('biases' in sens.sa)
            # print sens.sa.biases
        # It should return either ...
        #  nlabels * nsplits
        req_nsamples = [nlabels * nsplits]
        if nlabels == 2:
            # A single sensitivity in case of binary
            req_nsamples += [nsplits]
        else:
            # and for pairs in case of multiclass
            req_nsamples += [(nlabels * (nlabels - 1) / 2) * nsplits]
            # and for 1-vs-1 embedded within Multiclass operating on
            # pairs (e.g. SMLR)
            req_nsamples += [req_nsamples[-1] * 2]

            # Also for regression_based -- they can do multiclass
            # but only 1 sensitivity is provided
            if 'regression_based' in clf.__tags__:
                req_nsamples += [nsplits]

        # # of features should correspond
        self.assertEqual(sens.shape[1], ds.nfeatures)
        # # of samples/sensitivities should also be reasonable
        self.assertTrue(sens.shape[0] in req_nsamples)

        # Check if labels are present
        self.assertTrue('splits' in sens.sa)
        self.assertTrue('targets' in sens.sa)
        # should be 1D -- otherwise dtype object
        self.assertTrue(sens.sa.targets.ndim == 1)

        sens_ulabels = sens.sa['targets'].unique
        # Some labels might be pairs(tuples) so ndarray would be of
        # dtype object and we would need to get them all
        if sens_ulabels.dtype is np.dtype('object'):
            sens_ulabels = np.unique(
                reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

        assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

        errors = [x.percent_correct for x in sana.clf.ca.stats.matrices]

        # lets go through all sensitivities and see if we selected the right
        # features
        #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
        if '5%' in clf.descr \
               or (nlabels > 2 and 'regression_based' in clf.__tags__):
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # if we get fewer than 2 features with non-zero sensitivities we
            # cannot really test
            # Also -- regression based classifiers performance for multiclass
            # is expected to suck in general
            return

        if cfg.getboolean('tests', 'labile', default='yes'):
            for conf_matrix in [sana.clf.ca.training_stats] \
                              + sana.clf.ca.stats.matrices:
                self.assertTrue(
                    conf_matrix.percent_correct>=70,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct,
                     nlabels))

        # Since now we have per-split and possibly per-label sensitivities --
        # let's just aggregate (sum) them per feature per label across splits
        sensm = FxMapper('samples', lambda x: np.sum(x),
                         uattrs=['targets']).forward(sens)
        sensgm = maxofabs_sample().forward(sensm)  # global max of abs of means

        assert_equal(sensgm.shape[0], 1)
        assert_equal(sensgm.shape[1], ds.nfeatures)

        selected = FixedNElementTailSelector(len(ds.a.bogus_features))(
            sensgm.samples[0])

        if cfg.getboolean('tests', 'labile', default='yes'):

            self.assertEqual(
                set(selected),
                set(ds.a.nonbogus_features),
                msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s" %
                (selected, ds.a.nonbogus_features))

            # Now test each one per label
            # TODO: collect all failures and spit them out at once --
            #       that would make it easy to see if the sensitivity
            #       just has incorrect order of labels assigned
            for sens1 in sensm:
                labels1 = sens1.targets  # labels (1) for this sensitivity
                lndim = labels1.ndim
                label = labels1[0]  # current label

                # XXX whole lndim comparison should be gone after
                #     things get fixed and we arrive here with a tuple!
                if lndim == 1:  # just a single label
                    self.assertTrue(label in ulabels)

                    ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                    # should have just 1 feature for the label
                    self.assertEqual(len(ilabel_all), 1)
                    ilabel = ilabel_all[0]

                    maxsensi = np.argmax(sens1)  # index of max sensitivity
                    self.assertEqual(
                        maxsensi, ilabel,
                        "Maximal sensitivity for %s was found in %i whenever"
                        " original feature was %i for nonbogus features %s" %
                        (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
                elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                    # we should have highest (in abs) coefficients in
                    # those two labels
                    maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                    ilabel2 = [
                        np.where(ds.fa.nonbogus_targets == l)[0][0]
                        for l in label
                    ]
                    self.assertEqual(
                        set(maxsensi2), set(ilabel2),
                        "Maximal sensitivity for %s was found in %s whenever"
                        " original features were %s for nonbogus features %s" %
                        (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                    """
                    # Now test for the sign of each one in pair ;) in
                    # all binary problems L1 (-1) -> L2(+1), then
                    # weights for L2 should be positive.  to test for
                    # L1 -- invert the sign
                    # We already know (if we haven't failed in previous test),
                    # that those 2 were the strongest -- so check only signs
                    """
                    self.assertTrue(
                        sens1.samples[0, ilabel2[0]] < 0,
                        "With %i classes in pair %s got feature %i for %r >= 0"
                        % (nlabels, label, ilabel2[0], label[0]))
                    self.assertTrue(
                        sens1.samples[0, ilabel2[1]] > 0,
                        "With %i classes in pair %s got feature %i for %r <= 0"
                        % (nlabels, label, ilabel2[1], label[1]))
                else:
                    # yoh could be wrong at this assumption... time will show
                    self.fail("Got unknown number labels per sensitivity: %s."
                              " Should be either a single label or a pair" %
                              labels1)
Example #8
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore a few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    # give the same full dataset to sens_ana and cvmeasure
                    Repeater(2),
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    # we will use the same full cv-training dataset
                    Repeater(2),
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    train_pmeasure=False,  # we just extract it from existing confusion
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the feature set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
Example #9
    def test_feature_selector(self):
        """Test feature selector"""
        # remove the 10% weakest
        selector = FractionTailSelector(0.1)
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        # == rank [4, 5, 6, 7, 0, 3, 2, 9, 1, 8]
        target10 = np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])
        target30 = np.array([0, 1, 2, 3, 7, 8, 9])

        self.assertRaises(UnknownStateError, selector.ca.__getattribute__,
                          'ndiscarded')
        self.assertTrue((selector(data) == target10).all())
        selector.felements = 0.30  # discard 30%
        self.assertTrue(selector.felements == 0.3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)  # i.e. 3 were discarded

        selector = FixedNElementTailSelector(1)
        #                   0   1   2  3   4    5  6  7  8   9
        data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
        self.assertTrue((selector(data) == target10).all())

        selector.nelements = 3
        self.assertTrue(selector.nelements == 3)
        self.assertTrue((selector(data) == target30).all())
        self.assertTrue(selector.ca.ndiscarded == 3)

        # test range selector
        # simple range 'above'
        self.assertTrue((RangeElementSelector(lower=0)(data) == \
                         np.array([0,1,2,3,7,8,9])).all())

        self.assertTrue((RangeElementSelector(lower=0,
                                              inclusive=True)(data) == \
                         np.array([0,1,2,3,5,6,7,8,9])).all())

        self.assertTrue((RangeElementSelector(lower=0, mode='discard',
                                              inclusive=True)(data) == \
                         np.array([4])).all())

        # simple range 'below'
        self.assertTrue((RangeElementSelector(upper=2)(data) == \
                         np.array([4,5,6])).all())

        self.assertTrue((RangeElementSelector(upper=2,
                                              inclusive=True)(data) == \
                         np.array([4,5,6,7])).all())

        self.assertTrue((RangeElementSelector(upper=2, mode='discard',
                                              inclusive=True)(data) == \
                         np.array([0,1,2,3,8,9])).all())

        # ranges
        self.assertTrue((RangeElementSelector(lower=2, upper=9)(data) == \
                         np.array([0,2,3])).all())

        self.assertTrue((RangeElementSelector(lower=2, upper=9,
                                              inclusive=True)(data) == \
                         np.array([0,2,3,7,9])).all())

        self.assertTrue((RangeElementSelector(
            upper=2, lower=9, mode='discard',
            inclusive=True)(data) == RangeElementSelector(
                lower=2, upper=9, inclusive=False)(data)).all())

        # non-0 elements -- should be equivalent to np.nonzero()[0]
        self.assertTrue((RangeElementSelector()(data) == \
                         np.nonzero(data)[0]).all())
Example #10
 def __call__(self, datasets):
     ref_ds = self.ref_ds
     nsamples, nfeatures = datasets[ref_ds].shape
     if 'roi_seed' in datasets[ref_ds].fa and np.any(
             datasets[ref_ds].fa['roi_seed']):
         seed_index = np.where(datasets[ref_ds].fa.roi_seed)
     else:
         if not self.full_matrix:
             raise ValueError(
                 "Setting full_matrix=False requires roi_seed `fa` in the "
                 "reference dataset indicating center feature and some "
                 "feature(s) being marked as `roi_seed`.")
         seed_index = None
     # Voxel selection within Searchlight
     # Usual metric of between-subject between-voxel correspondence
     # Making sure ref_ds has most features, if not force feature selection on others
     nfeatures_all = [sd.nfeatures for sd in datasets]
     bigger_ds_idxs = [
         i for i, sd in enumerate(datasets) if sd.nfeatures > nfeatures
     ]
     if self.featsel != 1.0:
         # computing feature scores from the data
         feature_scores = compute_feature_scores(datasets,
                                                 self.exclude_from_model)
         nfeatures_sel = nfeatures  # default
         if self.featsel < 1.0 and int(self.featsel * nfeatures) > 0:
             nfeatures_sel = int(self.featsel * nfeatures)
         if self.featsel > 1.0:
             nfeatures_sel = min(nfeatures, self.featsel)
         fselector = FixedNElementTailSelector(nfeatures_sel,
                                               tail='upper',
                                               mode='select',
                                               sort=False)
         # XXX Artificially make the seed_index feature score high to keep it(?)
         if self.use_same_features:
             if len(self.exclude_from_model):
                 feature_scores = [
                     feature_scores[ifs] for ifs in range(len(datasets))
                     if ifs not in self.exclude_from_model
                 ]
             feature_scores = np.mean(np.asarray(feature_scores), axis=0)
             if seed_index is not None:
                 feature_scores[seed_index] = max(feature_scores)
             features_selected = fselector(feature_scores)
             datasets = [sd[:, features_selected] for sd in datasets]
         else:
             features_selected = []
             for fs in feature_scores:
                 if seed_index is not None:
                     fs[seed_index] = max(fs)
                 features_selected.append(fselector(fs))
             datasets = [
                 sd[:, fsel]
                 for fsel, sd in zip(features_selected, datasets)
             ]
     elif bigger_ds_idxs:
         # compute feature scores and select for bigger datasets
         feature_scores = compute_feature_scores(datasets,
                                                 self.exclude_from_model)
         feature_scores = [feature_scores[isub] for isub in bigger_ds_idxs]
         fselector = FixedNElementTailSelector(nfeatures,
                                               tail='upper',
                                               mode='select',
                                               sort=False)
         features_selected = [fselector(fs) for fs in feature_scores]
         for selected_features, isub in zip(features_selected,
                                            bigger_ds_idxs):
             datasets[isub] = datasets[isub][:, selected_features]
     # Try hyperalignment
     try:
         # it is crucial to retrain hyperalignment, otherwise it would simply
         # project into the common space of a previous iteration
         if len(self.exclude_from_model) == 0:
             self.hyperalignment.train(datasets)
         else:
             self.hyperalignment.train([
                 datasets[i] for i in range(len(datasets))
                 if i not in self.exclude_from_model
             ])
         mappers = self.hyperalignment(datasets)
         if mappers[0].proj.dtype is self.dtype:
             mappers = [m.proj for m in mappers]
         else:
             mappers = [m.proj.astype(self.dtype) for m in mappers]
         if self.featsel != 1.0:
             # Reshape the projection matrix from selected to all features
             mappers_full = [
                 np.zeros((nfeatures_all[im], nfeatures_all[ref_ds]))
                 for im in range(len(mappers))
             ]
             if self.use_same_features:
                 for mf, m in zip(mappers_full, mappers):
                     mf[np.ix_(features_selected, features_selected)] = m
             else:
                 for mf, m, fsel in zip(mappers_full, mappers,
                                        features_selected):
                     mf[np.ix_(fsel, features_selected[ref_ds])] = m
             mappers = mappers_full
         elif bigger_ds_idxs:
             for selected_features, isub in zip(features_selected,
                                                bigger_ds_idxs):
                 mapper_full = np.zeros(
                     (nfeatures_all[isub], nfeatures_all[ref_ds]))
                 mapper_full[np.ix_(selected_features,
                                    range(nfeatures))] = mappers[isub]
                 mappers[isub] = mapper_full
     except LinAlgError:
         print "SVD didn't converge. Try with a new reference, may be."
         mappers = [np.eye(nfeatures, dtype='int')] * len(datasets)
     # Extract only the row/column corresponding to the center voxel if full_matrix is False
     if not self.full_matrix:
         mappers = [np.squeeze(m[:, seed_index]) for m in mappers]
     return mappers
Example #11
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")

# GNB
clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
clfswh += \
    FeatureSelectionClassifier(
        GNB(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="GNB on 5%(ANOVA)")

# GPR
Example #12
    
    data = np.concatenate(data)
    labels = np.concatenate(labels)
    
    return data, labels.astype(int)

rois = ['aSTG', 'HG', 'pSTG']

for sub_id in range(1, 21):
    data = []
    for roi in rois:
        data_path = os.path.join(data_dir, roi)
        tmp_data, label = load_data(data_path, sub_id)
        data.append(tmp_data)
    data = np.concatenate(data, axis=1)
    data = np.concatenate([data[i,:,:].T for i in range(len(data))])

    ds = Dataset(data)
    ds.sa['time_coords'] = np.linspace(0, len(ds)-1, len(ds))
    events = [{'onset': i*5, 'duration': 5, 'targets':label[i], 'chunks':i+1} for i in range(int(len(ds)/5))]

    hrf_estimates = fit_event_hrf_model(ds, events, time_attr='time_coords', condition_attr=('targets', 'chunks'), 
                                    design_kwargs=dict(drift_model='blank'), glmfit_kwargs=dict(model='ols'),
                                    return_model=True)

    fsel = SensitivityBasedFeatureSelection(OneWayAnova(), FixedNElementTailSelector(5000, mode='select', tail='upper'))

    fsel.train(hrf_estimates)
    ds_p = fsel(hrf_estimates)

    np.save('feat_sub{:03d}'.format(sub_id), ds_p.samples)
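
For reference, a minimal sketch for loading back one of the arrays saved by the loop above (np.save adds the .npy extension; subject 1 shown):

sub_id = 1
feat = np.load('feat_sub{:03d}.npy'.format(sub_id))  # estimates x selected features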