Example #1
import itertools

import pandas as pd

# `tools` (ProgressBar) and the worker `calc_mv_parallel` come from the
# source module.
def calc_mv(clf, scorer, regions=None, processes=7, method='sequential'):
    n_regions = clf.data[1].shape[0]
    if regions is None:
        regions = range(0, n_regions)

    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(len(regions), start=True)

    overall_results = []
    for result in pool.imap(
            calc_mv_parallel,
            itertools.izip(
                itertools.repeat((clf.data, clf.classifier, scorer,
                                  clf.feature_importances, method)), regions)):
        pb.next()
        for row in result:
            overall_results.append(row)

    return pd.DataFrame(overall_results,
                        columns=['score', 'num_features', 'region'])
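
The `pool = itertools` fallback works because, in Python 2, `itertools.imap` has the same call shape as `multiprocessing.Pool.imap`, so the same consuming loop runs serially or in parallel. A minimal, self-contained sketch of the pattern (the `square` worker and its inputs are illustrative, not from the source):

import itertools
from multiprocessing import Pool

def square(x):
    return x * x

def run(values, processes=1):
    # Both objects expose imap(), so the consuming loop is identical.
    if processes > 1:
        pool = Pool(processes=processes)
    else:
        pool = itertools  # itertools.imap stands in for Pool.imap (Python 2)
    results = list(pool.imap(square, values))
    if processes > 1:
        pool.close()
        pool.join()
    return results

if __name__ == '__main__':
    print run([1, 2, 3, 4], processes=2)  # [1, 4, 9, 16]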
Example #2
import itertools

import numpy as np
import pandas as pd

# `tools` and the worker `calc_mv_parallel_classifier` come from the source
# module.
def calc_mv_classifier(clf, scorer, regions=None, processes=7,
                       method='sequential'):
    import os.path as path
    from tempfile import mkdtemp

    n_regions = clf.data.shape[0]
    if regions is None:
        regions = range(0, n_regions)
    
    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(len(regions), start=True)

    # Stage the working data in a temp-file memmap so worker processes can
    # re-open it by filename instead of receiving a pickled copy.
    filename = path.join(mkdtemp(), 'data.dat')
    data = np.memmap(filename, dtype='object', mode='w+', shape=clf.comp_dims)
    data[:] = clf.data[:]

    overall_results = []
    for result in pool.imap(
            calc_mv_parallel_classifier,
            itertools.izip(
                itertools.repeat((filename, clf.classifier, scorer,
                                  clf.comp_dims, clf.feature_importances,
                                  np.array(clf.feature_names), method)),
                regions)):
        pb.next()
        for row in result:
            overall_results.append(row)
   
    overall_results = pd.DataFrame(
        overall_results,
        columns=['score', 'num_features', 'region', 'feature'])
    overall_results.region += 1  # report regions 1-based
    return overall_results
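
`calc_mv_classifier` differs from `calc_mv` mainly in staging the data through a `numpy.memmap`, so that workers re-open it from disk rather than receiving pickled copies. A minimal sketch of that staging pattern with a plain numeric dtype (the filename and shape here are illustrative):

import os.path as path
from tempfile import mkdtemp

import numpy as np

filename = path.join(mkdtemp(), 'data.dat')

# Parent process: create the memmap and flush the data to disk.
data = np.memmap(filename, dtype='float64', mode='w+', shape=(100, 10))
data[:] = np.random.rand(100, 10)
data.flush()

# Worker process: re-open the same file read-only; nothing is copied.
shared = np.memmap(filename, dtype='float64', mode='r', shape=(100, 10))
print shared.mean()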
Example #3
    def load_data(self, features, X_threshold):
        """ Load data into c_data """
        # `tools`, `np`, `itertools`, `path`, `mkdtemp` and `binarize` are
        # module-level imports in the source file.
        from neurosynth.analysis.reduce import average_within_regions

        # Load the masks-by-studies matrix.
        # TODO: allow filtering by features.
        masks_by_studies = average_within_regions(
            self.dataset, self.mask_img, threshold=self.thresh)

        study_ids = self.dataset.feature_table.data.index

        print "Loading data from neurosynth..."

        pb = tools.ProgressBar(len(list(masks_by_studies)), start=True)

        self.ids_by_masks = []
        self.data_by_masks = []
        for mask in masks_by_studies:

            m_ids = study_ids[np.where(mask == True)[0]]
            self.ids_by_masks.append(m_ids)
            self.data_by_masks.append(self.dataset.get_feature_data(ids=m_ids))
            pb.next()

        self.mask_num = masks_by_studies.shape[0]
        self.mask_pairs = list(itertools.permutations(range(0, self.mask_num), 2))

        filename = path.join(mkdtemp(), 'c_data.dat')
        self.c_data = np.memmap(filename, dtype='object',
                                mode='w+', shape=(self.mask_num, self.mask_num))
        # Build a two-class dataset for every ordered pair of regions.
        for pair in self.mask_pairs:
            reg1_ids = self.ids_by_masks[pair[0]]
            reg2_ids = self.ids_by_masks[pair[1]]

            reg1_set = list(set(reg1_ids) - set(reg2_ids))
            reg2_set = list(set(reg2_ids) - set(reg1_ids))

            x1 = self.data_by_masks[pair[0]]
            x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]]

            x2 = self.data_by_masks[pair[1]]
            x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]] 

            y = np.array([0]*len(reg1_set) + [1]*len(reg2_set))

            X = np.vstack((x1, x2))

            if X_threshold is not None:
                X = binarize(X, X_threshold)

            from neurosynth.analysis.classify import regularize
            X = regularize(X, method='scale')

            self.c_data[pair] = (X, y)

        if self.memsave:
            # Free the per-mask caches once c_data has been built.
            self.data_by_masks = []
            self.ids_by_masks = []
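
The pair loop above labels studies by exclusive membership: a study counts for a region only if it does not also activate the other region of the pair. The core indexing idiom, in isolation (the IDs are made up):

import numpy as np

reg1_ids = np.array([101, 102, 103, 104])
reg2_ids = np.array([103, 104, 105, 106])

# Studies unique to each region of the pair.
reg1_only = list(set(reg1_ids) - set(reg2_ids))
reg2_only = list(set(reg2_ids) - set(reg1_ids))

# Row indices to keep in each region's feature matrix.
print np.where(np.in1d(reg1_ids, reg1_only))[0]  # [0 1]
print np.where(np.in1d(reg2_ids, reg2_only))[0]  # [2 3]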
Example #4
import itertools

import numpy as np

# `tools` and the worker `get_ns_for_pairs` come from the source module.
def calculate_ns(clf):
    mask_pairs = list(itertools.combinations(clf.masklist, 2))
    # Start fully masked; cells are unmasked as counts are written in.
    clf.ns = np.ma.masked_array(
        np.empty((clf.mask_num, clf.mask_num, 2)), True)

    pb = tools.ProgressBar(len(mask_pairs))

    for index, n in itertools.imap(
            get_ns_for_pairs,
            itertools.izip(itertools.repeat(clf.dataset), mask_pairs,
                           itertools.repeat(clf.thresh))):
        clf.ns[index] = n
        pb.next()
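
Initializing the result array as a fully masked `np.ma.masked_array` means only cells that a worker actually writes become visible. The fill pattern on its own (the values are illustrative):

import itertools

import numpy as np

n = 3
ns = np.ma.masked_array(np.empty((n, n)), True)  # start fully masked

for i, j in itertools.combinations(range(n), 2):
    ns[i, j] = i + j  # assigning a cell unmasks it

print ns  # cells never written still print as '--'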
Example #5
import itertools

import pandas as pd

# `tools` and the worker `bootstrap_mv_full_parallel` come from the source
# module.
def bootstrap_mv_full(dataset,
                      clf,
                      scorer,
                      mask,
                      features=None,
                      processes=None,
                      boot_n=100,
                      method='combinatorial',
                      outfile=None,
                      thresh_high=0.05,
                      thresh_low=0):
    from neurosynth.analysis.reduce import average_within_regions

    if processes != 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(boot_n, start=True)

    if method == 'shannons':
        cols = ['shannons', 'region', 'boot_n']
    else:
        cols = ['score', 'num_features', 'feature', 'region', 'boot_n']

    X = dataset.get_feature_data(features=features)
    y_high = average_within_regions(dataset, mask, threshold=thresh_high)
    y_low = average_within_regions(dataset, mask, threshold=thresh_low)
    # Get feature names
    overall_results = []
    for result in pool.imap(
            bootstrap_mv_full_parallel,
            itertools.izip(
                itertools.repeat((X, y_high, y_low, clf, scorer, method)),
                range(boot_n))):
        pb.next()

        print "I exited and I'm tring to save"

        if result is not None:
            if method != 'shannons':
                for row in result:
                    overall_results.append(row)
            else:
                overall_results.append(result)

            if outfile is not None:
                pd.DataFrame(overall_results, columns=cols).to_csv(outfile)
                print "Saved"

    overall_results = pd.DataFrame(overall_results, columns=cols)
    overall_results.region += 1
    return overall_results
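
Because each bootstrap sample is expensive, the loop re-writes `outfile` after every result, so a crash loses at most one sample. The checkpointing pattern by itself (the scores and the file name are illustrative):

import pandas as pd

cols = ['score', 'boot_n']
overall_results = []

for boot in range(3):  # stand-in for the bootstrap loop
    overall_results.append([0.5 + 0.1 * boot, boot])
    # Re-write the accumulated results after every iteration.
    pd.DataFrame(overall_results, columns=cols).to_csv('results.csv')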
Example #6
import itertools

import numpy as np
import pandas as pd

# `tools` and the worker `bootstrap_mv_parallel` come from the source module.
def bootstrap_mv(clf,
                 scorer,
                 regions=None,
                 processes=None,
                 boot_n=100,
                 method='sequential',
                 outfile=None):
    """ This function calculates "complexity curves" for each region in clf. The complexity curves are done on a 
    boostrapped sample boot_n times """

    n_regions = clf.data.shape[0]
    if regions is None:
        regions = range(0, n_regions)

    if processes != 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(len(regions) * boot_n, start=True)

    if method == 'shannons':
        cols = ['shannons', 'region', 'boot_n']
    else:
        cols = ['score', 'num_features', 'feature', 'region', 'boot_n']

    overall_results = []
    # Regions are processed sequentially; the boot_n resamples for each
    # region run through the pool.
    for i, (X, y) in enumerate(clf.data[regions]):
        for result in pool.imap(
                bootstrap_mv_parallel,
                itertools.izip(
                    itertools.repeat(
                        (X, y, clf.classifier, scorer, clf.feature_importances,
                         np.array(clf.feature_names), method, i)),
                    range(boot_n))):
            pb.next()

            if result is not None:
                if method != 'shannons':
                    for row in result:
                        overall_results.append(row)
                else:
                    overall_results.append(result)

                if outfile is not None:
                    pd.DataFrame(overall_results, columns=cols).to_csv(outfile)

    overall_results = pd.DataFrame(overall_results, columns=cols)
    overall_results.region += 1
    return overall_results
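
All of these functions share one calling convention for the pool workers: the constant arguments are bundled into a tuple and `itertools.repeat`-ed, then `izip`-ped with the varying argument, so each worker receives a single `(constants, varying)` tuple to unpack. In isolation (Python 2; the worker is illustrative):

import itertools

def worker(args):
    (a, b), i = args  # unpack the constant bundle and the varying item
    return a * i + b

constants = (2, 5)
jobs = itertools.izip(itertools.repeat(constants), range(4))
print list(itertools.imap(worker, jobs))  # [5, 7, 9, 11]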
Example #7
    def classify(self, features=None, scoring='accuracy', X_threshold=None,
                 feat_select=None, processes=1, class_weight='auto',
                 dummy=None):
        # `Pool`, `itertools`, `classify`, `DummyClassifier` and `tools` are
        # module-level imports in the source file.

        self.load_data(features, X_threshold)

        self.initalize_containers(features, feat_select, dummy)

        print "Classifying..."
        pb = tools.ProgressBar(len(list(self.mask_pairs)), start=True)

        if processes > 1:
            pool = Pool(processes=processes)
        else:
            pool = itertools

        try:
            filename = self.c_data.filename

            for output in pool.imap(
                    classify_parallel,
                    itertools.izip(
                        itertools.repeat((self.classifier, self.param_grid,
                                          scoring, filename, feat_select,
                                          self.mask_num, class_weight)),
                        self.mask_pairs)):

                index = output['index']
                self.class_score[index] = output['score']
                if self.memsave is False:
                    self.fit_clfs[index] = output['clf']

                if self.param_grid:  # grid search: pull FIs from the best estimator
                    try:
                        self.feature_importances[index] = self.fit_clfs[
                            index].best_estimator_.coef_[0]
                    except AttributeError:
                        try:
                            self.feature_importances[index] = self.fit_clfs[
                                index].best_estimator.feature_importances_
                        except AttributeError:
                            pass
                else:
                    try:
                        self.feature_importances[
                            index] = self.fit_clfs[index].clf.coef_[0]
                    except AttributeError:
                        try:
                            self.feature_importances[index] = self.fit_clfs[
                                index].clf.feature_importances_
                        except AttributeError:
                            pass

                if feat_select:
                    self.features_selected[index] = output['features_selected']

                if dummy is not None:
                    # Score a dummy baseline on the same data so chance-level
                    # performance can be subtracted below.
                    X, y = self.c_data[index]
                    output = classify.classify(
                        X, y, classifier=DummyClassifier(strategy=dummy),
                        cross_val='4-Fold', class_weight=class_weight,
                        scoring=scoring, feat_select=feat_select)

                    self.dummy_score[index] = output['score']

                pb.next()
        finally:
            if processes > 1:
                pool.close()
                pool.join()

        if dummy is None:
            self.final_score = self.class_score
        else:
            # Report performance as the margin over the dummy baseline.
            self.final_score = self.class_score - self.dummy_score
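
Subtracting the `DummyClassifier` score converts raw accuracy into accuracy above chance, which matters when classes are imbalanced. A self-contained sketch of the idea with scikit-learn (the data is synthetic, and plain `cross_val_score` stands in for the project's `classify.classify`):

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X = np.random.rand(80, 5)
y = np.array([0] * 60 + [1] * 20)  # imbalanced labels

real = cross_val_score(LogisticRegression(), X, y, cv=4).mean()
chance = cross_val_score(
    DummyClassifier(strategy='most_frequent'), X, y, cv=4).mean()

print real - chance  # margin over the majority-class baseline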