def load_globals(config):
    """Fill the shared ``mapreduce`` module with everything the mappers need.

    Reads the data described by ``config["data"]``, the mesh at
    ``config["structure"]["mesh"]`` and the mask at
    ``config["structure"]["mask"]``, builds the Nesterov linear operator
    from them and stores all of it, plus ``config`` itself, as attributes
    of the ``mapreduce`` module.
    """
    import mapreduce as GLOBAL  # the module doubles as a global-variable store

    GLOBAL.DATA = GLOBAL.load_data(config["data"])

    import brainomics.mesh_processing as mesh_utils

    structure = config["structure"]
    coords, triangles = mesh_utils.mesh_arrays(structure["mesh"])
    node_mask = np.load(structure["mask"])

    GLOBAL.mesh_coord = coords
    GLOBAL.mesh_triangles = triangles
    GLOBAL.mask = node_mask

    # Only the first element of the returned pair is kept.
    linear_operator, _ = tv_helper.nesterov_linear_operator_from_mesh(
        GLOBAL.mesh_coord, GLOBAL.mesh_triangles, GLOBAL.mask)
    GLOBAL.A = linear_operator
    GLOBAL.CONFIG = config
def load_globals(config):
    """Fill the shared ``mapreduce`` module with data and structure operators.

    Stores on the ``mapreduce`` module:

    * ``DATA``: loaded from ``config["data"]``,
    * ``STRUCTURE`` / ``A``: the array at ``config["structure"]`` and the
      operator built from it with ``tv_helper.A_from_mask``,
    * ``N_COMP``: ``config["N_COMP"]``,
    * ``Atv``: operator built from the merged ``lrh.pial.gii`` template mesh
      and ``mask.npy`` found under ``INPUT_BASE_DIR``,
    * ``FULL_RESAMPLE``: ``config['full_resample']``.
    """
    # This import must come first: a function-scope import makes
    # ``tv_helper`` a local name for the whole function, so calling
    # ``tv_helper.A_from_mask`` before the import statement raised
    # UnboundLocalError.
    import parsimony.functions.nesterov.tv as tv_helper
    import mapreduce as GLOBAL  # access to global variables

    GLOBAL.DATA = GLOBAL.load_data(config["data"])

    STRUCTURE = np.load(config["structure"])
    A = tv_helper.A_from_mask(STRUCTURE)
    N_COMP = config["N_COMP"]
    GLOBAL.A, GLOBAL.STRUCTURE, GLOBAL.N_COMP = A, STRUCTURE, N_COMP

    mesh_coord, mesh_triangles = mesh_utils.mesh_arrays(
        os.path.join(TEMPLATE_PATH, "lrh.pial.gii"))
    mask = np.load(os.path.join(INPUT_BASE_DIR, "mask.npy"))
    Atv = tv_helper.linear_operator_from_mesh(mesh_coord, mesh_triangles,
                                              mask=mask)
    GLOBAL.Atv = Atv
    GLOBAL.FULL_RESAMPLE = config['full_resample']
import shutil

BASE_PATH = "/neurospin/brainomics/2016_schizConnect/analysis/all_studies+VIP/Freesurfer/all_subjects_less_than_30years"
TEMPLATE_PATH = os.path.join(BASE_PATH, "freesurfer_template")
INPUT_CSV = os.path.join(BASE_PATH, "population_30yo.csv")
OUTPUT = os.path.join(BASE_PATH, "data")

# Read pop csv
pop = pd.read_csv(INPUT_CSV)
# Save the acquisition-site codes next to the other data files.
# ``.values`` replaces ``Series.as_matrix()``, which was removed in pandas 1.0;
# the destination is the same path as before, now derived from OUTPUT.
np.save(os.path.join(OUTPUT, "site.npy"), pop["site_num"].values)

#############################################################################
## Build mesh template
import brainomics.mesh_processing as mesh_utils
cor_l, tri_l = mesh_utils.mesh_arrays(
    os.path.join(TEMPLATE_PATH, "lh.pial.gii"))
cor_r, tri_r = mesh_utils.mesh_arrays(
    os.path.join(TEMPLATE_PATH, "rh.pial.gii"))
# Stack left then right coordinates; right-hemisphere triangle indices are
# shifted by the number of left coordinates so they point into the stacked
# coordinate array.
cor = np.vstack([cor_l, cor_r])
tri_r += cor_l.shape[0]
tri = np.vstack([tri_l, tri_r])
mesh_utils.mesh_from_arrays(cor, tri,
                            path=os.path.join(TEMPLATE_PATH, "lrh.pial.gii"))
shutil.copyfile(os.path.join(TEMPLATE_PATH, "lrh.pial.gii"),
                os.path.join(OUTPUT, "lrh.pial.gii"))

#############################################################################
# Read images
n = len(pop)
assert n == 280
# enettv
penalty_start = 2

MASK_PATH = "/neurospin/brainomics/2016_icaar-eugei/results/Freesurfer/ICAAR+EUGEI/data/mask.npy"
OUTPUT = "/neurospin/brainomics/2016_icaar-eugei/results/Freesurfer/ICAAR+EUGEI/enettv/model_selection_5folds/0.5_0.56_0.24_0.2"

# Load the stored weight vector and pass it through
# array_utils.arr_threshold_from_norm2_ratio with a ratio of 0.99.
beta = np.load(os.path.join(OUTPUT, "beta.npz"))['arr_0']
beta, _ = array_utils.arr_threshold_from_norm2_ratio(beta, 0.99)

####################################################################################
# Stage the left, right and merged template meshes next to the results.
for _mesh_name in ("lh.pial.gii", "rh.pial.gii", "lrh.pial.gii"):
    shutil.copyfile(os.path.join(TEMPLATE_PATH, _mesh_name),
                    os.path.join(OUTPUT, _mesh_name))

cor_l, tri_l = mesh_utils.mesh_arrays(os.path.join(OUTPUT, "lh.pial.gii"))
cor_r, tri_r = mesh_utils.mesh_arrays(os.path.join(OUTPUT, "rh.pial.gii"))
assert cor_l.shape[0] == cor_r.shape[0]

cor_both, tri_both = mesh_utils.mesh_arrays(
    os.path.join(OUTPUT, "lrh.pial.gii"))
mask__mesh = np.load(MASK_PATH)
_n_mesh_nodes = mask__mesh.shape[0]
assert _n_mesh_nodes == cor_both.shape[0] == cor_l.shape[0] * 2 \
    == cor_l.shape[0] + cor_r.shape[0]
# NOTE(review): this is ``assert <expr>, <message>``; it only checks that the
# mask is non-empty and would show the mask sum as the failure message.
assert mask__mesh.shape[0], mask__mesh.sum()

# Find the mapping from components in masked mesh to left_mesh and right_mesh
# (the merged mesh was built as: cor = np.vstack([cor_l, cor_r])).
_node_index = np.arange(_n_mesh_nodes)
mask_left__mesh = _node_index < _n_mesh_nodes / 2
mask_left__mesh[np.logical_not(mask__mesh)] = False
mask_right__mesh = _node_index >= _n_mesh_nodes / 2
import json
from brainomics import array_utils
import brainomics.mesh_processing as mesh_utils
import shutil

BASE_PATH = "/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/results_30yo"
MASK_PATH = "/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/30yo/mask.npy"
TEMPLATE_PATH = "/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/freesurfer_template"
OUTPUT = "/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/results_30yo/enettv/weight_map"

# Stage the per-hemisphere template meshes next to the results.
shutil.copyfile(os.path.join(TEMPLATE_PATH, "lh.pial.gii"), os.path.join(OUTPUT, "lh.pial.gii"))
shutil.copyfile(os.path.join(TEMPLATE_PATH, "rh.pial.gii"), os.path.join(OUTPUT, "rh.pial.gii"))
# The merged mesh is read from OUTPUT below but was never staged there;
# copy it from the template directory when it is not already present so the
# read cannot fail on a fresh OUTPUT directory.
if not os.path.exists(os.path.join(OUTPUT, "lrh.pial.gii")):
    shutil.copyfile(os.path.join(TEMPLATE_PATH, "lrh.pial.gii"),
                    os.path.join(OUTPUT, "lrh.pial.gii"))

cor_l, tri_l = mesh_utils.mesh_arrays(os.path.join(OUTPUT, "lh.pial.gii"))
cor_r, tri_r = mesh_utils.mesh_arrays(os.path.join(OUTPUT, "rh.pial.gii"))
assert cor_l.shape[0] == cor_r.shape[0]

cor_both, tri_both = mesh_utils.mesh_arrays(
    os.path.join(OUTPUT, "lrh.pial.gii"))
mask__mesh = np.load(MASK_PATH)
assert mask__mesh.shape[0] == cor_both.shape[0] == cor_l.shape[0] * 2 \
    == cor_l.shape[0] + cor_r.shape[0]
# NOTE(review): ``assert <expr>, <message>`` form; only checks that the mask
# is non-empty, the sum is merely the failure message.
assert mask__mesh.shape[0], mask__mesh.sum()

# Find the mapping from components in masked mesh to left_mesh and right_mesh
# concat was initially: cor = np.vstack([cor_l, cor_r])
mask_left__mesh = np.arange(mask__mesh.shape[0]) < mask__mesh.shape[0] / 2
mask_left__mesh[np.logical_not(mask__mesh)] = False
mask_right__mesh = np.arange(mask__mesh.shape[0]) >= mask__mesh.shape[0] / 2
mris_convert /i2bm/local/freesurfer/subjects/fsaverage/surf/rh.pial ./rh.pial.gii
"""
import os
import numpy as np
import scipy.sparse as sparse

# Locations of the template meshes and of the dataset's output directory.
BASE_PATH = "/neurospin/brainomics/2013_adni/"
TEMPLATE_PATH = os.path.join(BASE_PATH, "freesurfer_template")
OUTPUT = os.path.join(BASE_PATH, "MCIc-CTL-FS")

import numpy as np
import brainomics.mesh_processing as mesh_utils
# Read the coordinates and triangles of the merged (left + right) template.
mesh_coord, mesh_triangles = mesh_utils.mesh_arrays(
    os.path.join(TEMPLATE_PATH, "lrh.pial.gii"))

# params
mask = np.load(os.path.join(OUTPUT, "mask.npy"))

import parsimony.functions.nesterov.tv as tv_helper
# Build the linear operator from the mesh restricted to ``mask``; only the
# first element of the returned pair is kept.
A, _ = tv_helper.nesterov_linear_operator_from_mesh(mesh_coord, mesh_triangles, mask=mask)

"""
# count neighbors (arrity) for each node
n_neighbors = np.array([len(n) for n in nodes_with_edges])
print np.sum(n_neighbors)
print np.sum(n_neighbors) / float(len(nodes_with_edges))
print [[n, np.sum(n_neighbors == n)] for n in np.unique(n_neighbors)]
BASE_PATH = "/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST"
INPUT_FS = "/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/freesurfer_assembled_data_fsaverage"
TEMPLATE_PATH = os.path.join(BASE_PATH, "Freesurfer", "freesurfer_template")
INPUT_CSV = os.path.join(BASE_PATH, "Freesurfer", "population_50yo.csv")
OUTPUT = os.path.join(BASE_PATH, "Freesurfer", "data", "50yo")

penalty_start = 2

# Population table
pop = pd.read_csv(INPUT_CSV)

#############################################################################
## Merge the two hemisphere templates into a single mesh
import brainomics.mesh_processing as mesh_utils

cor_l, tri_l = mesh_utils.mesh_arrays(os.path.join(TEMPLATE_PATH, "lh.pial.gii"))
cor_r, tri_r = mesh_utils.mesh_arrays(os.path.join(TEMPLATE_PATH, "rh.pial.gii"))

# Left coordinates come first, so right-hemisphere triangle indices are
# offset by the number of left coordinates before stacking.
cor = np.vstack((cor_l, cor_r))
tri_r += cor_l.shape[0]
tri = np.vstack((tri_l, tri_r))

_merged_template = os.path.join(TEMPLATE_PATH, "lrh.pial.gii")
mesh_utils.mesh_from_arrays(cor, tri, path=_merged_template)
shutil.copyfile(_merged_template, os.path.join(OUTPUT, "lrh.pial.gii"))

#############################################################################
# Read images
n = len(pop)
assert n == 226
os.mkdir(OUTPUT)

TEMPLATE_PATH = os.path.join(BASE_PATH, "freesurfer_template")
# Stage the per-hemisphere template meshes in the freshly created OUTPUT.
shutil.copyfile(os.path.join(TEMPLATE_PATH, "lh.pial.gii"), os.path.join(OUTPUT, "lh.pial.gii"))
shutil.copyfile(os.path.join(TEMPLATE_PATH, "rh.pial.gii"), os.path.join(OUTPUT, "rh.pial.gii"))

# Read the configuration through a context manager so the file handle is
# closed deterministically.
with open("config_5cv.json") as _config_file:
    config = json.load(_config_file)

#from soma import aims
#os.path.join(OUTPUT, "lh.pial.gii")
#mesh = aims.read(os.path.join(OUTPUT, "lh.pial.gii"))
#mesh.header()
cor_l, tri_l = mesh_utils.mesh_arrays(os.path.join(OUTPUT, "lh.pial.gii"))
cor_r, tri_r = mesh_utils.mesh_arrays(os.path.join(OUTPUT, "rh.pial.gii"))
assert cor_l.shape[0] == cor_r.shape[0] == 163842

cor_both, tri_both = mesh_utils.mesh_arrays(config["structure"]["mesh"])
mask__mesh = np.load(config["structure"]["mask"])
assert mask__mesh.shape[0] == cor_both.shape[0] == cor_l.shape[0] * 2 \
    == cor_l.shape[0] + cor_r.shape[0]
# The tuple must be parenthesised: ``assert a, b == c`` treats ``b == c`` as
# the failure message and never evaluates the comparison.
assert (mask__mesh.shape[0], mask__mesh.sum()) == (327684, 317089)

# Find the mapping from beta in masked mesh to left_mesh and right_mesh
# concat was initially: cor = np.vstack([cor_l, cor_r])
mask_left__mesh = np.arange(mask__mesh.shape[0]) < mask__mesh.shape[0] / 2
mask_left__mesh[np.logical_not(mask__mesh)] = False
mask_right__mesh = np.arange(mask__mesh.shape[0]) >= mask__mesh.shape[0] / 2
mask_right__mesh[np.logical_not(mask__mesh)] = False
# Every masked-in node must belong to exactly one hemisphere.
assert mask__mesh.sum() == (mask_left__mesh.sum() + mask_right__mesh.sum())
def _init_stratified_folds(labels, n_splits):
    """Return ``[train, test]`` index pairs of an unshuffled stratified K-fold."""
    # ``random_state`` is deliberately not passed: without ``shuffle=True`` it
    # never influenced the folds, and scikit-learn >= 0.24 raises ValueError
    # when it is set on an unshuffled splitter.
    splitter = StratifiedKFold(n_splits=n_splits)
    return [[tr, te] for tr, te in
            splitter.split(np.zeros(labels.shape[0]), labels.ravel())]


def _init_check_outer_cv(cv_outer):
    """Assert that ``cv_outer`` equals the folds stored in the original config."""
    old_config = os.path.join(WD_ORIGINAL, "config_modselectcv.json")
    with open(old_config) as fd:
        cv_old = json.load(fd)["resample"]
    cv_outer_old = [cv_old['cv%02d/refit' % i] for i in range(NFOLDS_OUTER)]
    assert np.all([np.all(np.array(cv_outer_old[i][0]) == cv_outer[i][0])
                   for i in range(NFOLDS_OUTER)])
    assert np.all([np.all(np.array(cv_outer_old[i][1]) == cv_outer[i][1])
                   for i in range(NFOLDS_OUTER)])


def _init_write_json(obj, path):
    """Serialize ``obj`` as JSON to ``path``, closing the file afterwards."""
    with open(path, "w") as fd:
        json.dump(obj, fd)


def init():
    """Prepare the working directory ``WD`` and write the map-reduce configs.

    Steps, in order:

    1. Copy ``X.npy``, ``y.npy`` and ``mask.npy`` from ``WD_ORIGINAL`` and the
       merged mesh into ``WD``, then ``chdir`` into ``WD``.
    2. Build and save ``Atv.npz`` (linear operator from the mesh) unless it
       already exists.
    3. Fit ridge logistic regressions for a few ``alpha`` values and save
       their coefficients as ``beta_start.npz`` unless it already exists.
    4. Setting 1: write ``config_cv_largerange.json`` (outer CV only, large
       parameter grid) and the matching cluster job files.
    5. Setting 2: write ``config_dcv_reducedrange.json`` (outer + nested CV,
       reduced parameter grid) and the matching cluster job files.

    Relies on module-level names (``WD``, ``WD_ORIGINAL``, ``WD_CLUSTER``,
    ``NFOLDS_OUTER``, ``NFOLDS_INNER``, ``penalty_start``,
    ``user_func_filename``, ``estimators``, ...) defined elsewhere in the file.
    """
    import collections

    INPUT_DATA_X = os.path.join(WD_ORIGINAL, 'X.npy')
    INPUT_DATA_y = os.path.join(WD_ORIGINAL, 'y.npy')
    INPUT_MASK_PATH = os.path.join(WD_ORIGINAL, 'mask.npy')
    INPUT_MESH_PATH = '/neurospin/brainomics/2013_adni/MCIc-CTL-FS_cs/lrh.pial.gii'
    #INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/data/30yo/Atv.npz'
    # INPUT_CSV = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/Freesurfer/population_30yo.csv'

    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_MESH_PATH, WD)
    #shutil.copy(INPUT_LINEAR_OPE_PATH, WD)

    ## Create config file
    os.chdir(WD)
    X = np.load("X.npy")
    y = np.load("y.npy")

    if not os.path.exists(os.path.join(WD, "Atv.npz")):
        import brainomics.mesh_processing as mesh_utils
        cor, tri = mesh_utils.mesh_arrays(os.path.join(WD, "lrh.pial.gii"))
        mask = np.load(os.path.join(WD, 'mask.npy'))

        import parsimony.functions.nesterov.tv as nesterov_tv
        from parsimony.utils.linalgs import LinearOperatorNesterov
        Atv = nesterov_tv.linear_operator_from_mesh(cor, tri, mask,
                                                    calc_lambda_max=True)
        Atv.save(os.path.join(WD, "Atv.npz"))
        # Reload the saved operator and check it round-trips.
        Atv_ = LinearOperatorNesterov(filename=os.path.join(WD, "Atv.npz"))
        assert Atv.get_singular_values(0) == Atv_.get_singular_values(0)
        assert np.allclose(Atv_.get_singular_values(0), 8.999,
                           rtol=1e-03, atol=1e-03)
        assert np.all([a.shape == (317089, 317089) for a in Atv])

    if not os.path.exists(os.path.join(WD, "beta_start.npz")):
        import time
        betas = dict()
        alphas = [.01, 0.1, 1.0, 10]
        for alpha in alphas:
            mod = estimators.RidgeLogisticRegression(
                l=alpha, class_weight="auto", penalty_start=penalty_start)
            t_ = time.time()
            mod.fit(X, y.ravel())
            print(time.time() - t_)  # 11564
            betas["lambda_%.2f" % alpha] = mod.beta

        np.savez(os.path.join(WD, "beta_start.npz"), **betas)
        # Reload and check the stored coefficients match what was computed.
        beta_start = np.load(os.path.join(WD, "beta_start.npz"))
        assert np.all([np.all(beta_start[a] == betas[a])
                       for a in beta_start.keys()])

    ## Create config file

    # ########################################################################
    # Setting 1: 5cv + large range of parameters: cv_largerange
    # with sub-sample training set with size 50, 100
    # 5cv/cv0*[_sub50]/refit/*

    # sub_sizes = [50, 100]
    sub_sizes = []

    cv_outer = _init_stratified_folds(y, NFOLDS_OUTER)
    # check we got the same CV as previously
    _init_check_outer_cv(cv_outer)

    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        # Simple CV
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]

        # Nested CV
        # cv_inner = StratifiedKFold(y[tr_val].ravel(), n_folds=NFOLDS_INNER, random_state=42)
        # for cv_inner_i, (tr, val) in enumerate(cv_inner):
        #     cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = [tr_val[tr], tr_val[val]]

        # Sub-sample training set with size 50, 100
        # => cv*_sub[50|100]/refit
        grps = np.unique(y[tr_val]).astype(int)
        ytr = y.copy()
        ytr[te] = np.nan  # exclude test samples from the group lookup below
        g_idx = [np.where(ytr == g)[0] for g in grps]
        assert np.all([np.all(ytr[g_idx[g]] == g) for g in grps])

        g_size = np.array([len(g) for g in g_idx])
        g_prop = g_size / g_size.sum()

        for sub_size in sub_sizes:
            # Draw from each group in proportion to its share of the
            # training set.
            sub_g_size = np.round(g_prop * sub_size).astype(int)
            g_sub_idx = [np.random.choice(g_idx[g], sub_g_size[g],
                                          replace=False) for g in grps]
            assert np.all([np.all(y[g_sub_idx[g]] == g) for g in grps])
            tr_val_sub = np.concatenate(g_sub_idx)
            assert len(tr_val_sub) == sub_size
            assert np.all([idx in tr_val for idx in tr_val_sub])
            assert np.all(np.logical_not([idx in te for idx in tr_val_sub]))
            cv["cv%02d_sub%i/refit" % (cv_outer_i, sub_size)] = [tr_val_sub, te]

    # Index arrays are not JSON serializable: convert them to plain lists.
    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}

    # Nested CV
    # assert len(cv_largerange) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1

    # Simple CV
    # assert len(cv) == NFOLDS_OUTER + 1

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(cv) == NFOLDS_OUTER * (1 + len(sub_sizes)) + 1

    print(list(cv.keys()))

    # Large grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1, 1.0] # first ran with this grid
    tv_ratio = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    l1l2_ratio = [0.1, 0.5, 0.9]
    # l1l2_ratio = [0, 0.1, 0.5, 0.9, 1.0] # first ran with this grid
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio, tv_ratio)]
    assert len(params_enet_tvgn) == 240  # old 300

    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 12  # old 15

    params = params_enet_tvgn + params_enet
    assert len(params) == 252  # 315
    # Simple CV
    # assert len(params) * len(cv) == 1890

    # Simple CV + sub-sample training set with size 50, 100:
    assert len(params) * len(cv) == 1512  # 5040

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    _init_write_json(config, os.path.join(WD, "config_cv_largerange.json"))

    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    cmd = "mapreduce.py --map  %s/config_cv_largerange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_cv_largerange",
                                            freecores=2)

    # ########################################################################
    # Setting 2: dcv + reduced range of parameters: dcv_reducedrange
    # 5cv/cv0*/cvnested0*/*

    cv_outer = _init_stratified_folds(y, NFOLDS_OUTER)
    # check we got the same CV as previously
    _init_check_outer_cv(cv_outer)

    cv = collections.OrderedDict()
    cv["refit/refit"] = [np.arange(len(y)), np.arange(len(y))]

    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/refit" % (cv_outer_i)] = [tr_val, te]
        # Inner folds index into tr_val, so map them back to positions in y.
        cv_inner = _init_stratified_folds(y[tr_val], NFOLDS_INNER)
        for cv_inner_i, (tr, val) in enumerate(cv_inner):
            cv["cv%02d/cvnested%02d" % ((cv_outer_i), cv_inner_i)] = \
                [tr_val[tr], tr_val[val]]

    cv = {k: [cv[k][0].tolist(), cv[k][1].tolist()] for k in cv}
    #assert len(cv) == NFOLDS_OUTER + 1
    assert len(cv) == NFOLDS_OUTER * NFOLDS_INNER + NFOLDS_OUTER + 1
    print(list(cv.keys()))

    # Reduced grid of parameters
    alphas = [0.001, 0.01, 0.1, 1.0]
    # alphas = [.01, 0.1] # original
    tv_ratio = [0.2, 0.8]
    l1l2_ratio = [0.1, 0.9]
    algos = ["enettv", "enetgn"]
    params_enet_tvgn = [list(param) for param in
                        itertools.product(algos, alphas, l1l2_ratio, tv_ratio)]
    assert len(params_enet_tvgn) == 32  # 16

    params_enet = [list(param) for param in
                   itertools.product(["enet"], alphas, l1l2_ratio, [0])]
    assert len(params_enet) == 8  # 4

    params = params_enet_tvgn + params_enet
    assert len(params) == 40  # 20
    assert len(params) * len(cv) == 1240  # 620

    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure_linear_operator_tv="Atv.npz",
                  beta_start="beta_start.npz",
                  map_output="5cv",
                  user_func=user_func_filename)
    _init_write_json(config, os.path.join(WD, "config_dcv_reducedrange.json"))

    # Build utils files: sync (push/pull) and PBS
    cmd = "mapreduce.py --map  %s/config_dcv_reducedrange.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00",
                                            suffix="_dcv_reducedrange",
                                            freecores=2)