Exemple #1
0
def test_factorialpartitioner():
    """Compare FactorialPartitioner with a ChainNode/Sifter reference.

    The reference construction is copied from test_usecases.  After the
    comparison on the full dataset, the partitioner is exercised on a
    dataset with a single superordinate value, on an unbalanced dataset
    (where a RuntimeWarning is expected) and on a tiny dummy dataset with
    hard-coded expected partitions.
    """

    def collect(node, dataset):
        # gather the `partitions` sample attribute of everything generated
        return [split.sa.partitions for split in node.generate(dataset)]

    # Six categories in total, grouped into three superordinate ones;
    # there is no real 'superordinate' effect because the subordinate
    # categories are independent.
    ds = normal_feature_dataset(
        perlabel=30,
        nlabels=6,
        nfeatures=6,
        nchunks=5,
        snr=100,  # pure signal
        nonbogus_features=range(6))
    ds.sa['subord'] = ds.sa.targets.copy()
    superord_labels = []
    for target in ds.targets:
        # second character of the target label, folded into 3 groups
        superord_labels.append('super%d' % (int(target[1]) % 3, ))
    ds.sa['superord'] = superord_labels
    # wipe the original targets so nothing can silently rely on them
    ds.targets[:] = 0

    # variant with a single superordinate category
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1'] * len(ds_1super.targets)

    # variant where the superordinate categories own different numbers of
    # subordinate categories
    unbalanced_sa = {'subord': [0, 0, 1, 2], 'superord': [1, 1, 2, 2]}
    ds_unbalanced = Dataset(range(4), sa=unbalanced_sa)

    # Reference: n-fold over 'subord', then keep only the splits whose
    # partition 2 is balanced across all 'superord' values.
    superord_values = ds.sa['superord'].unique
    balance_sifter = Sifter([('partitions', 2),
                             ('superord', {
                                 'uvalues': superord_values,
                                 'balanced': True
                             })])
    reference = ChainNode(
        [NFoldPartitioner(len(superord_values), attr='subord'),
         balance_sifter],
        space='partitions')

    # implementation under test
    factorial = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                     attr='superord')

    expected = collect(reference, ds)
    observed = collect(factorial, ds)
    assert_array_equal(np.sort(expected), np.sort(observed))

    # with one superordinate class it should match a plain n-fold
    plain_nfold = NFoldPartitioner(attr='subord')
    expected = collect(plain_nfold, ds_1super)
    observed = collect(factorial, ds_1super)
    assert_array_equal(np.sort(expected), np.sort(observed))

    # smoke test for unbalanced subord classes
    warning_msg = ('One or more superordinate attributes do not have the same '
                   'number of subordinate attributes. This could yield to '
                   'unbalanced partitions.')
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        observed = collect(factorial, ds_unbalanced)

    # (partitions, superord per partition value, subord per partition value)
    expected_cases = [
        (np.array([2, 2, 2, 1]), ([2], [1, 1, 2]), ([2], [0, 0, 1])),
        (np.array([2, 2, 1, 2]), ([2], [1, 1, 2]), ([1], [0, 0, 2])),
    ]
    for got, (want, want_superord, want_subord) in zip(observed,
                                                       expected_cases):
        assert_array_equal(got, want)
        part1 = ds_unbalanced[got == 1]
        part2 = ds_unbalanced[got == 2]
        assert_array_equal(
            (part1.sa.superord.tolist(), part2.sa.superord.tolist()),
            want_superord)
        assert_array_equal(
            (part1.sa.subord.tolist(), part2.sa.subord.tolist()),
            want_subord)

    # finally a dummy dataset with fully known partitions
    dummy_sa = {'subord': range(4), 'superord': [1, 2, 1, 2]}
    ds_dummy = Dataset(range(4), sa=dummy_sa)
    assert_array_equal(
        collect(factorial, ds_dummy),
        [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
 def partition(ds_=ds, **kwargs):
     """Return the `partitions` attribute of every split generated for `ds_`.

     A FactorialPartitioner wrapping an NFoldPartitioner over 'targets',
     with 'chunks' as its attribute, is built on each call; any extra
     keyword arguments are forwarded to FactorialPartitioner.
     """
     inner = NFoldPartitioner(attr='targets')
     factorial = FactorialPartitioner(partitioner=inner,
                                      attr='chunks',
                                      **kwargs)
     collected = []
     for split in factorial.generate(ds_):
         collected.append(split.sa.partitions)
     return collected
def test_factorialpartitioner():
    """Test FactorialPartitioner, including count/selection_strategy.

    The partitioner is first compared with the ChainNode/Sifter reference
    copied from test_usecases, then checked with `count` and
    `selection_strategy` ('first', the default strategy, and 'random'),
    and finally run on a dataset with a single superordinate value, an
    unbalanced dataset (a RuntimeWarning is expected) and a tiny dummy
    dataset with hard-coded expected partitions.
    """
    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # superordinate categories with different numbers of subordinate ones
    ds_unbalanced = Dataset(range(4),
                            sa={
                                'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]
                            })

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    def partition(partitioner, ds_=ds):
        """Collect the `partitions` attribute of everything generated."""
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'), attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)

    # NOTE(review): np.sort sorts along the last axis, so this compares the
    # sorted contents of each partition vector, not the set of partitions
    assert_array_equal(np.sort(p_npart), np.sort(p_fpart))

    fpart2 = FactorialPartitioner(count=2,
                                  selection_strategy='first',
                                  **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but if with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random',
                                   count=2,
                                   **factkw)
    # Let's generate a number of random selections.
    # `range` (not `xrange`) so the test also runs on Python 3; for ten
    # iterations the behavior on Python 2 is unchanged.
    rand2_partitions = [partition(fpartr2) for i in range(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # majority of them must be different
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    p_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={
                           'subord': range(4),
                           'superord': [1, 2] * 2
                       })
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(
        p_fpart, [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
Exemple #4
0
def main(infile,
         outdir,
         radius,
         mask,
         zscoring,
         classification,
         derivs=True,
         debugging=False,
         permute=None,
         decoder='svm',
         errors=False):
    """Run a cross-validated searchlight on a stored dataset and save the map.

    The dataset is loaded from `infile` with `h5load`, samples whose
    condition is 'self' are dropped, targets and a partitioner are set up
    according to `classification`, and the resulting searchlight map is
    written with `h5save` to
    ``<outdir>/<name>/<classification>/sl_map[_permNNN].hdf5`` where
    ``<name>`` encodes the mask, z-scoring, radius, feature set and decoder.

    Parameters
    ----------
    infile : str
        HDF5 file holding the dataset.
    outdir : str
        Base directory for the output; subdirectories are created as needed.
    radius : int
        Searchlight sphere radius; also written into the output name as
        ``<radius>vx``.
    mask : str or None
        If truthy, loaded with `fmri_dataset`; only features where the mask
        is greater than zero are kept.
    zscoring : bool
        If true, z-score the dataset within each chunk.
    classification : str
        One of 'familiar_vs_unfamiliar', 'familiar_vs_unfamiliar-id',
        'familiar_vs_unfamiliar-id-chunks', 'identity-all',
        'identity-familiar', 'identity-unfamiliar'.
    derivs : bool
        If true, derivative features are used as well (the dataset must
        then have a 'derivs' feature attribute); otherwise only the
        features with ``derivs == 0`` are kept.
    debugging : bool
        If true, "SLC" is appended to `debug.active`.
    permute : int or None
        If truthy, labels are permuted.  For 'familiar_vs_unfamiliar-id'
        it is a 1-based index into ``get_unique_combs(8)``; for the other
        schemes it is passed to `shuffle_sa` as `rand_seed`.  It is also
        appended to the output file name.
    decoder : str
        'svm' (LinearCSVMC) or 'gnb' (GNB).
    errors : bool
        If false, the map is stored as ``1 - value`` (presumably turning
        error rates into accuracies -- TODO confirm).

    Raises
    ------
    ValueError
        If derivatives are requested but missing, or `decoder` is unknown.
    NotImplementedError
        If `classification` is not one of the supported schemes.
    OSError
        If the output directory cannot be created.
    """
    # more verbose searchlight output
    if debugging:
        debug.active += ["SLC"]
    print('Loading {0}'.format(infile))
    ds = h5load(infile)
    # check we have derivatives too
    if derivs and 'derivs' not in ds.fa:
        raise ValueError(
            'Dataset {0} does not contain derivatives'.format(infile))

    # let's try familiar vs unfamiliar
    if classification in [
            'familiar_vs_unfamiliar', 'familiar_vs_unfamiliar-id',
            'familiar_vs_unfamiliar-id-chunks', 'identity-all',
            'identity-familiar', 'identity-unfamiliar'
    ]:
        ds = ds[ds.sa.condition != 'self']
        # permute if needed
        if permute:
            if classification != 'familiar_vs_unfamiliar-id':
                ds = shuffle_sa(ds, rand_seed=permute)
            else:
                # for familiar_vs_unfamiliar-id we need a fancier perm:
                # relabel whole conditions according to the selected
                # combination instead of shuffling individual samples
                perm = get_unique_combs(8)[permute - 1]
                perm = flatten(perm)
                unique_conds = np.unique(ds.sa.condition)
                mapperm = dict()
                for i, p in enumerate(perm):
                    mapperm[unique_conds[i]] = unique_conds[p]
                for i in range(ds.nsamples):
                    this_cond = ds.sa.condition[i]
                    ds.sa.condition[i] = mapperm[this_cond]
                print("USING PERMUTATION {0}".format(mapperm))
        ds.sa['familiarity'] = [
            'familiar' if 'friend' in a else 'control' for a in ds.sa.condition
        ]
    else:
        raise NotImplementedError('Classification not implemented')

    # if we are using a dataset with derivatives but we don't want to use them
    # as features, extract only the non-derivatives features
    sfx = ''
    if 'derivs' in ds.fa and not derivs:
        ds = ds[:, ds.fa.derivs == 0]
        sfx += '_betaderivs'

    # set up clf and cv
    if decoder == 'svm':
        clf = LinearCSVMC()
    elif decoder == 'gnb':
        clf = GNB()
    else:
        raise ValueError(
            'I have no clue about this classifier {0}'.format(decoder))

    if classification == 'familiar_vs_unfamiliar':
        ds.sa['targets'] = ds.sa['familiarity']
        partitioner = NFoldPartitioner()
    elif classification == 'familiar_vs_unfamiliar-id':
        ds.sa['targets'] = ds.sa['familiarity']
        partitioner = FactorialPartitioner(NFoldPartitioner(attr='condition'),
                                           attr='targets')
        #if permute:
        #    rng = np.random.RandomState(permute)
        #    permutator = AttributePermutator(['familiarity'],
        #            limit=['partitions', 'chunks'],
        #            rng=rng)
        #    partitioner = ChainNode([partitioner, permutator], space='partitions')
    elif classification == 'familiar_vs_unfamiliar-id-chunks':
        ds.sa['targets'] = ds.sa['familiarity']
        # to do within chunks cross-validation across identities
        partitioner = ChainNode([
            FactorialPartitioner(NFoldPartitioner(attr='condition'),
                                 attr='familiarity'),
            ExcludeTargetsCombinationsPartitioner(
                k=1, targets_attr='chunks', space='partitions')
        ],
                                space='partitions')
    elif classification == 'identity-all':
        ds.sa['targets'] = ds.sa['condition']
        partitioner = NFoldPartitioner()
    elif classification == 'identity-familiar':
        ds.sa['targets'] = ds.sa['condition']
        ds = ds.select(
            sadict={'condition': ['friend' + str(i) for i in range(1, 5)]})
        assert (ds.nsamples == 44)
        partitioner = NFoldPartitioner()
    elif classification == 'identity-unfamiliar':
        ds.sa['targets'] = ds.sa['condition']
        ds = ds.select(
            sadict={'condition': ['control' + str(i) for i in range(1, 5)]})
        assert (ds.nsamples == 44)
        partitioner = NFoldPartitioner()

    cv = CrossValidation(clf, partitioner)

    if mask:
        mask_ds = fmri_dataset(mask)
        # the mask must be defined over the same voxels as the dataset
        if derivs:
            assert (np.all(mask_ds.fa.voxel_indices == ds.fa.voxel_indices[
                ds.fa.derivs == 0]))
        else:
            assert (np.all(mask_ds.fa.voxel_indices == ds.fa.voxel_indices))
        assert (len(mask_ds) == 1)
        mask_ = mask_ds.samples[0]  # extract mask as the first sample
        #assert(np.all(mask_ == mask_ds.samples.flatten()))
        if derivs:
            # need to make the mask bigger: repeat it once more so it also
            # covers the derivative features
            mask_ = np.tile(mask_, 2)
        ds = ds[:, mask_ > 0]
    if derivs:
        # beta and derivative features must refer to the same voxels
        assert (np.all(ds.fa.voxel_indices[ds.fa.derivs == 0] ==
                       ds.fa.voxel_indices[ds.fa.derivs == 1]))
    #ds = remove_invariant_features(ds)
    # zscore within each chunk
    if zscoring:
        zscore(ds, chunks_attr='chunks', dtype='float32')

    # copy for efficiency
    ds_ = ds.copy(deep=False,
                  sa=['targets', 'chunks', 'familiarity', 'condition'],
                  fa=['voxel_indices', 'derivs'],
                  a=['mapper'])
    print(ds_)

    if derivs:
        # centers are restricted to the non-derivative features
        sl = Searchlight(cv,
                         IndexQueryEngine(voxel_indices=Sphere(radius),
                                          derivs=Sphere(2)),
                         postproc=mean_sample(),
                         roi_ids=np.where(ds_.fa.derivs == 0)[0],
                         nproc=8)
    else:
        sl = sphere_searchlight(
            cv,
            radius=radius,
            space='voxel_indices',
            #center_ids=range(0, 1000),
            postproc=mean_sample(),
            nproc=8)

    # run it! -- oh, PyMVPA!
    sl_map = sl(ds_)
    # copy mapper
    sl_map.a = ds.a
    # remove unnecessary field to make file smaller
    del sl_map.a['add_regs']

    if not errors:
        # store 1 - value instead of the raw searchlight output
        sl_map.samples *= -1
        sl_map.samples += 1
    # reduce size
    sl_map.samples = sl_map.samples.astype('float32')

    # save
    fnout = 'sl'
    if mask:
        fnout += 'msk'
    if zscoring:
        fnout += 'z'
    fnout += str(radius) + 'vx'
    if derivs:
        fnout += '_featderivs'
        sfx = ''
    fnout += sfx
    fnout += '_' + decoder

    sl_out = pjoin(outdir, fnout, classification)
    try:
        os.makedirs(sl_out)
    except OSError:
        # An already existing directory is fine (e.g. a rerun or another
        # permutation writing to the same place); any other failure, such
        # as missing permissions, must not be hidden until h5save fails.
        if not os.path.isdir(sl_out):
            raise

    print('Saving in {0}'.format(sl_out))
    fnslmap = 'sl_map'
    if permute:
        fnslmap += '_perm{0:03d}'.format(permute)
    fnslmap += '.hdf5'
    h5save(pjoin(sl_out, fnslmap), sl_map)
Exemple #5
0
def test_factorialpartitioner():
    """Check FactorialPartitioner against the ChainNode/Sifter reference.

    The reference construction is copied from test_usecases.  The
    partitioner is then run on a dataset with one superordinate value, on
    an unbalanced dataset (expecting a RuntimeWarning) and on a small dummy
    dataset whose partitions are spelled out explicitly.
    """

    def sample_partitions(node, dataset):
        # `partitions` sample attribute of each dataset yielded by `node`
        return [generated.sa.partitions for generated in node.generate(dataset)]

    # 6 categories in total, grouped into 3 superordinate ones, without any
    # 'superordinate' effect since the subordinate categories are independent
    ds = normal_feature_dataset(
        nchunks=5,
        nfeatures=6,
        nonbogus_features=range(6),
        nlabels=6,
        perlabel=30,
        snr=100,  # pure signal
    )
    ds.sa["subord"] = ds.sa.targets.copy()
    # 3 superord categories derived from the digit in each target label
    ds.sa["superord"] = ["super%d" % (int(label[1]) % 3,) for label in ds.targets]
    # make sure nothing depends on the original targets
    ds.targets[:] = 0

    # only one superordinate category
    ds_1super = ds.copy()
    ds_1super.sa["superord"] = ["super1"] * len(ds_1super.targets)

    # superordinate categories with unequal numbers of subordinate ones
    ds_unbalanced = Dataset(
        range(4),
        sa={"subord": [0, 0, 1, 2], "superord": [1, 1, 2, 2]},
    )

    # reference: n-fold over 'subord', sifted down to the splits whose
    # partition 2 is balanced over all 'superord' values
    superord_values = ds.sa["superord"].unique
    nfold_by_subord = NFoldPartitioner(len(superord_values), attr="subord")
    keep_balanced = Sifter(
        [
            ("partitions", 2),
            ("superord", {"uvalues": superord_values, "balanced": True}),
        ]
    )
    npart = ChainNode([nfold_by_subord, keep_balanced], space="partitions")

    # the implementation under test
    factpart = FactorialPartitioner(NFoldPartitioner(attr="subord"), attr="superord")

    from_reference = sample_partitions(npart, ds)
    from_factorial = sample_partitions(factpart, ds)
    assert_array_equal(np.sort(from_reference), np.sort(from_factorial))

    # a single superord class should reduce to the plain n-fold
    nfold = NFoldPartitioner(attr="subord")
    from_nfold = sample_partitions(nfold, ds_1super)
    from_factorial = sample_partitions(factpart, ds_1super)
    assert_array_equal(np.sort(from_nfold), np.sort(from_factorial))

    # smoke test for unbalanced subord classes
    warning_msg = "One or more superordinate attributes do not have the same " \
                  "number of subordinate attributes. This could yield to " \
                  "unbalanced partitions."
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        from_factorial = sample_partitions(factpart, ds_unbalanced)

    # one entry per expected split:
    # (partitions, superord per partition value, subord per partition value)
    expectations = [
        (np.array([2, 2, 2, 1]), ([2], [1, 1, 2]), ([2], [0, 0, 1])),
        (np.array([2, 2, 1, 2]), ([2], [1, 1, 2]), ([1], [0, 0, 2])),
    ]
    for out_part, (true_part, super_out, sub_out) in zip(from_factorial, expectations):
        assert_array_equal(out_part, true_part)
        in_part1 = ds_unbalanced[out_part == 1]
        in_part2 = ds_unbalanced[out_part == 2]
        observed_super = (in_part1.sa.superord.tolist(), in_part2.sa.superord.tolist())
        assert_array_equal(observed_super, super_out)
        observed_sub = (in_part1.sa.subord.tolist(), in_part2.sa.subord.tolist())
        assert_array_equal(observed_sub, sub_out)

    # dummy dataset with fully specified expected partitions
    ds_dummy = Dataset(range(4), sa={"subord": range(4), "superord": [1, 2, 1, 2]})
    expected_dummy = [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]]
    assert_array_equal(sample_partitions(factpart, ds_dummy), expected_dummy)
Exemple #6
0
 def partition(ds_=ds, **kwargs):
     """Generate factorial partitions of `ds_` and return them as a list.

     Builds a FactorialPartitioner (NFoldPartitioner over 'targets',
     attr='chunks', plus any extra keyword arguments) and returns the
     `partitions` sample attribute of each dataset it generates.
     """
     factorial_kwargs = dict(partitioner=NFoldPartitioner(attr='targets'),
                             attr='chunks')
     generated = FactorialPartitioner(**factorial_kwargs, **kwargs).generate(ds_)
     return [split.sa.partitions for split in generated]