def calc_mv(clf, scorer, regions=None, processes=7, method='sequential'):
    n_regions = clf.data[1].shape[0]
    if regions is None:
        regions = range(0, n_regions)

    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        # itertools.imap (Python 2) is a drop-in serial stand-in for Pool.imap
        pool = itertools

    pb = tools.ProgressBar(len(regions), start=True)

    overall_results = []
    for result in pool.imap(
            calc_mv_parallel,
            itertools.izip(
                itertools.repeat((clf.data, clf.classifier, scorer,
                                  clf.feature_importances, method)),
                regions)):
        pb.next()
        for row in result:
            overall_results.append(row)

    return pd.DataFrame(overall_results,
                        columns=['score', 'num_features', 'region'])
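# The serial fallback above works because itertools.imap (Python 2) takes the
# same (func, iterable) arguments as multiprocessing.Pool.imap, so one loop
# serves both code paths. A minimal sketch of the pattern; the worker and
# helper names here are hypothetical, not part of this module:
def _example_worker(x):
    # Stand-in for calc_mv_parallel: any picklable top-level function works.
    return x * 2

def _example_pool_fallback(items, processes=1):
    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools  # itertools.imap stands in for Pool.imap
    return list(pool.imap(_example_worker, items))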
def calc_mv_classifier(clf, scorer, regions=None, processes=7,
                       method='sequential'):
    import os.path as path
    from tempfile import mkdtemp

    n_regions = clf.data.shape[0]
    if regions is None:
        regions = range(0, n_regions)

    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(len(regions), start=True)

    # Dump the data to a memory-mapped file so that worker processes can
    # re-open it by filename instead of receiving the full array by pickling.
    filename = path.join(mkdtemp(), 'data.dat')
    data = np.memmap(filename, dtype='object', mode='w+',
                     shape=clf.comp_dims)
    data[:] = clf.data[:]

    overall_results = []
    for result in pool.imap(
            calc_mv_parallel_classifier,
            itertools.izip(
                itertools.repeat((filename, clf.classifier, scorer,
                                  clf.comp_dims, clf.feature_importances,
                                  np.array(clf.feature_names), method)),
                regions)):
        pb.next()
        for row in result:
            overall_results.append(row)

    overall_results = pd.DataFrame(
        overall_results,
        columns=['score', 'num_features', 'region', 'feature'])
    # Shift to 1-based region labels
    overall_results.region += 1

    return overall_results
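# calc_mv_classifier shares clf.data with worker processes by writing it to a
# memory-mapped file and passing only the filename; each worker re-opens the
# map rather than unpickling the array. A minimal sketch of that round trip
# with a fixed-size dtype (memory-mapping an object-dtype array, as above,
# stores raw pointers and is not portable across processes, so float64 is the
# safe illustration):
def _example_memmap_roundtrip():
    import os.path as path
    from tempfile import mkdtemp

    filename = path.join(mkdtemp(), 'shared.dat')
    # Parent process: write the data once and flush it to disk
    arr = np.memmap(filename, dtype='float64', mode='w+', shape=(100, 10))
    arr[:] = np.random.rand(100, 10)
    arr.flush()
    # Worker process: re-open read-only by filename
    shared = np.memmap(filename, dtype='float64', mode='r', shape=(100, 10))
    return shared.mean()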
def load_data(self, features, X_threshold):
    """ Load data into c_data """
    import os.path as path
    from tempfile import mkdtemp
    from neurosynth.analysis.reduce import average_within_regions
    from neurosynth.analysis.classify import regularize
    from sklearn.preprocessing import binarize

    # Load mask-by-study activation matrix
    # TODO: add ability to filter by features
    masks_by_studies = average_within_regions(
        self.dataset, self.mask_img, threshold=self.thresh)

    study_ids = self.dataset.feature_table.data.index

    print "Loading data from neurosynth..."
    pb = tools.ProgressBar(len(list(masks_by_studies)), start=True)

    self.ids_by_masks = []
    self.data_by_masks = []
    for mask in masks_by_studies:
        m_ids = study_ids[np.where(mask == True)[0]]
        self.ids_by_masks.append(m_ids)
        self.data_by_masks.append(self.dataset.get_feature_data(ids=m_ids))
        pb.next()

    self.mask_num = masks_by_studies.shape[0]
    self.mask_pairs = list(itertools.permutations(range(0, self.mask_num), 2))

    filename = path.join(mkdtemp(), 'c_data.dat')
    self.c_data = np.memmap(filename, dtype='object', mode='w+',
                            shape=(self.mask_num, self.mask_num))

    # Build an (X, y) classification problem for each ordered pair of
    # regions, using only studies that activate one region but not the other.
    for pair in self.mask_pairs:
        reg1_ids = self.ids_by_masks[pair[0]]
        reg2_ids = self.ids_by_masks[pair[1]]

        reg1_set = list(set(reg1_ids) - set(reg2_ids))
        reg2_set = list(set(reg2_ids) - set(reg1_ids))

        x1 = self.data_by_masks[pair[0]]
        x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]]

        x2 = self.data_by_masks[pair[1]]
        x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]]

        y = np.array([0] * len(reg1_set) + [1] * len(reg2_set))
        X = np.vstack((x1, x2))

        if X_threshold is not None:
            X = binarize(X, X_threshold)

        X = regularize(X, method='scale')

        self.c_data[pair] = (X, y)

    # Free the per-mask caches once c_data is built
    if self.memsave:
        self.data_by_masks = []
        self.ids_by_masks = []
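# The pair loop above keeps only studies exclusive to each region, so the two
# classes never share a study. A toy illustration with made-up study IDs:
def _example_exclusive_sets():
    reg1_ids = ['s1', 's2', 's3']
    reg2_ids = ['s2', 's4']
    reg1_set = list(set(reg1_ids) - set(reg2_ids))  # studies only in region 1
    reg2_set = list(set(reg2_ids) - set(reg1_ids))  # studies only in region 2
    # Class labels: 0 for region-1-only studies, 1 for region-2-only studies
    y = np.array([0] * len(reg1_set) + [1] * len(reg2_set))
    return reg1_set, reg2_set, y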
def calculate_ns(clf):
    mask_pairs = list(itertools.combinations(clf.masklist, 2))

    # Masked 3-d array holding the study counts for each pair of masks
    clf.ns = np.ma.masked_array(
        np.empty((clf.mask_num, clf.mask_num, 2)), True)

    pb = tools.ProgressBar(len(mask_pairs))

    for index, n in itertools.imap(
            get_ns_for_pairs,
            itertools.izip(
                itertools.repeat(clf.dataset), mask_pairs,
                itertools.repeat(clf.thresh))):
        clf.ns[index] = n
        pb.next()
def bootstrap_mv_full(dataset, clf, scorer, mask, features=None,
                      processes=None, boot_n=100, method='combinatorial',
                      outfile=None, thresh_high=0.05, thresh_low=0):
    from neurosynth.analysis.reduce import average_within_regions

    if processes != 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(boot_n, start=True)

    if method == 'shannons':
        cols = ['shannons', 'region', 'boot_n']
    else:
        cols = ['score', 'num_features', 'feature', 'region', 'boot_n']

    X = dataset.get_feature_data(features=features)
    y_high = average_within_regions(dataset, mask, threshold=thresh_high)
    y_low = average_within_regions(dataset, mask, threshold=thresh_low)

    overall_results = []
    for result in pool.imap(
            bootstrap_mv_full_parallel,
            itertools.izip(
                itertools.repeat((X, y_high, y_low, clf, scorer, method)),
                range(boot_n))):
        pb.next()
        if result is not None:
            if method != 'shannons':
                for row in result:
                    overall_results.append(row)
            else:
                overall_results.append(result)

            # Checkpoint accumulated results after each bootstrap iteration
            if outfile is not None:
                pd.DataFrame(overall_results, columns=cols).to_csv(outfile)

    overall_results = pd.DataFrame(overall_results, columns=cols)
    overall_results.region += 1

    return overall_results
def bootstrap_mv(clf, scorer, regions=None, processes=None, boot_n=100,
                 method='sequential', outfile=None):
    """ Calculate a "complexity curve" for each region in clf, recomputing
    it on a bootstrapped sample boot_n times. """
    n_regions = clf.data.shape[0]
    if regions is None:
        regions = range(0, n_regions)

    if processes != 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(len(regions) * boot_n, start=True)

    if method == 'shannons':
        cols = ['shannons', 'region', 'boot_n']
    else:
        cols = ['score', 'num_features', 'feature', 'region', 'boot_n']

    overall_results = []
    # Process regions sequentially; bootstrap iterations run in parallel
    for i, (X, y) in enumerate(clf.data[regions]):
        for result in pool.imap(
                bootstrap_mv_parallel,
                itertools.izip(
                    itertools.repeat(
                        (X, y, clf.classifier, scorer,
                         clf.feature_importances,
                         np.array(clf.feature_names), method, i)),
                    range(boot_n))):
            pb.next()
            if result is not None:
                if method != 'shannons':
                    for row in result:
                        overall_results.append(row)
                else:
                    overall_results.append(result)

                # Checkpoint accumulated results after each iteration
                if outfile is not None:
                    pd.DataFrame(overall_results,
                                 columns=cols).to_csv(outfile)

    overall_results = pd.DataFrame(overall_results, columns=cols)
    overall_results.region += 1

    return overall_results
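# A hedged usage sketch for bootstrap_mv, assuming clf is an already-built
# pairwise object whose .data holds one (X, y) tuple per region; the scorer
# and file name below are illustrative, not part of the API:
#
#     from sklearn.metrics import accuracy_score
#     results = bootstrap_mv(clf, scorer=accuracy_score, regions=[0, 1],
#                            processes=4, boot_n=100, method='sequential',
#                            outfile='bootstrap_checkpoint.csv')
#     results.groupby('region').score.mean()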
def classify(self, features=None, scoring='accuracy', X_threshold=None,
             feat_select=None, processes=1, class_weight='auto', dummy=None):
    self.load_data(features, X_threshold)
    self.initalize_containers(features, feat_select, dummy)

    print "Classifying..."
    pb = tools.ProgressBar(len(list(self.mask_pairs)), start=True)

    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    try:
        filename = self.c_data.filename
        for output in pool.imap(
                classify_parallel,
                itertools.izip(
                    itertools.repeat(
                        (self.classifier, self.param_grid, scoring, filename,
                         feat_select, self.mask_num, class_weight)),
                    self.mask_pairs)):
            index = output['index']
            self.class_score[index] = output['score']
            if self.memsave is False:
                self.fit_clfs[index] = output['clf']

            # Extract coefficients or feature importances from the fitted
            # classifier; with a param_grid, take them from the best
            # estimator found by the grid search.
            if self.param_grid:
                try:
                    self.feature_importances[index] = self.fit_clfs[
                        index].best_estimator_.coef_[0]
                except AttributeError:
                    try:
                        self.feature_importances[index] = self.fit_clfs[
                            index].best_estimator_.feature_importances_
                    except AttributeError:
                        pass
            else:
                try:
                    self.feature_importances[
                        index] = self.fit_clfs[index].clf.coef_[0]
                except AttributeError:
                    try:
                        self.feature_importances[index] = self.fit_clfs[
                            index].clf.feature_importances_
                    except AttributeError:
                        pass

            if feat_select:
                self.features_selected[index] = output['features_selected']

            # Optionally score a dummy classifier as a chance baseline
            if dummy is not None:
                X, y = self.c_data[index]
                output = classify.classify(
                    X, y, classifier=DummyClassifier(strategy=dummy),
                    cross_val='4-Fold', class_weight=class_weight,
                    scoring=scoring, feat_select=feat_select)
                self.dummy_score[index] = output['score']

            pb.next()
    finally:
        if processes > 1:
            pool.close()
            pool.join()

    if dummy is None:
        self.final_score = self.class_score
    else:
        # Report each pair's score as its margin over the dummy baseline
        self.final_score = self.class_score - self.dummy_score
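# A hedged usage sketch: scoring every region pair with a chance baseline.
# `dummy` is forwarded to sklearn's DummyClassifier as its strategy, so values
# such as 'most_frequent' or 'stratified' apply; final_score then reports the
# margin over chance. The object construction is illustrative only:
#
#     pairs.classify(features=['emotion', 'memory'], scoring='accuracy',
#                    processes=4, dummy='most_frequent')
#     pairs.final_score  # class_score minus dummy_score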