params=params, resample=cv, structure="mask.nii", beta_start="beta_start.npy", structure_linear_operator_tv="Atv.npz", map_output="model_selectionCV", user_func=user_func_filename, reduce_input="results/*/*", reduce_group_by="params", reduce_output="model_selectionCV.csv") json.dump(config, open(os.path.join(WD, "config_dCV.json"), "w")) # Build utils files: sync (push/pull) and PBS import brainomics.cluster_gabriel as clust_utils sync_push_filename, sync_pull_filename, _ = \ clust_utils.gabriel_make_sync_data_files(WD, wd_cluster=WD_CLUSTER) cmd = "mapreduce.py --map %s/config_dCV.json" % WD_CLUSTER clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="2500:00:00") ############################################################################# def load_globals(config): import mapreduce as GLOBAL # access to global variables GLOBAL.DATA = GLOBAL.load_data(config["data"]) GLOBAL.A = LinearOperatorNesterov( filename=config["structure_linear_operator_tv"]) GLOBAL.DATA = GLOBAL.load_data(config["data"]) GLOBAL.DIR = config["map_output"] GLOBAL.BETA_START = np.load(config["beta_start"])
# Build the random-permutation mapreduce config, write it to WD, create the
# cluster sync/PBS helper files, push the data, then print (Python 2) the
# manual commands to run on the Gabriel cluster.
config = dict(data=dict(X=INPUT_DATA_X, y=INPUT_DATA_y),
              params=params, resample=rndperm,
              mask_filename=INPUT_MASK_PATH,
              penalty_start = 3,  # first 3 columns are covariates, not penalized
              map_output="rndperm",
              user_func=user_func_filename,
              #reduce_input="rndperm/*/*",
              reduce_group_by="params",
              reduce_output="ADAS11-MCIc-CTL_rndperm.csv")
json.dump(config, open(os.path.join(WD, "config_rndperm.json"), "w"))

#############################################################################
# Build utils files: sync (push/pull) and PBS
import brainomics.cluster_gabriel as clust_utils
sync_push_filename, sync_pull_filename, WD_CLUSTER = \
    clust_utils.gabriel_make_sync_data_files(WD)
cmd = "mapreduce.py --map %s/config_rndperm.json" % WD_CLUSTER
clust_utils.gabriel_make_qsub_job_files(WD, cmd, suffix="_rndperm")
#############################################################################
# Sync to cluster (runs the generated push script)
print "Sync data to gabriel.intra.cea.fr: "
os.system(sync_push_filename)
#############################################################################
# Operator instructions only — nothing below executes remote jobs itself.
print "# Start by running Locally with 2 cores, to check that everything os OK)"
print "Interrupt after a while CTL-C"
print "mapreduce.py --map %s/config_rndperm.json --ncore 2" % WD
#os.system("mapreduce.py --mode map --config %s/config.json" % WD)
print "# 1) Log on gabriel:"
print 'ssh -t gabriel.intra.cea.fr'
print "# 2) Run one Job to test"
print "qsub -I"
def init():
    """Prepare the working dir and double (nested) CV config for enet-TV.

    Copies X/y/mask/linear-operator into WD, builds the outer/inner
    stratified-CV resampling dict, builds the (alpha, l1, l2, tv) parameter
    grid, writes config_dCV.json and the cluster sync/PBS helper files.
    Relies on module-level WD, WD_CLUSTER, NFOLDS_OUTER, NFOLDS_INNER and
    penalty_start.
    """
    INPUT_DATA_X = '/neurospin/brainomics/2017_asd_charles/Freesurfer/data/X.npy'
    INPUT_DATA_y = '/neurospin/brainomics/2017_asd_charles/Freesurfer/data/y.npy'
    INPUT_MASK_PATH = '/neurospin/brainomics/2017_asd_charles/Freesurfer/data/mask.npy'
    INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2017_asd_charles/Freesurfer/data/Atv.npz'
    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_LINEAR_OPE_PATH, WD)
    ## Create config file
    y = np.load(INPUT_DATA_y)
    X = np.load(INPUT_DATA_X)
    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    if cv_outer[0] is not None:  # Make sure first fold is None
        # Prepend a "refit" pseudo-fold that trains and tests on all samples.
        cv_outer.insert(0, None)
        null_resampling = list()
        null_resampling.append(np.arange(0, len(y))), null_resampling.append(
            np.arange(0, len(y)))
        cv_outer[0] = null_resampling
    import collections
    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        if cv_outer_i == 0:
            cv["refit/refit"] = [tr_val, te]
        else:
            cv["cv%02d/refit" % (cv_outer_i - 1)] = [tr_val, te]
            # Inner folds index into tr_val, hence the tr_val[...] remapping.
            cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                       n_folds=NFOLDS_INNER, random_state=42)
            for cv_inner_i, (tr, val) in enumerate(cv_inner):
                cv["cv%02d/cvnested%02d" % ((cv_outer_i - 1), cv_inner_i)] = \
                    [tr_val[tr], tr_val[val]]
    for k in cv:
        cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]  # JSON-serializable
    print(list(cv.keys()))
    # Parameter grid: rows are (alpha, l1, l2, tv).
    tv_range = [0.0, .1, 0.2, 0.3, 0.4, 0.5, .6, 0.7, .8, 0.9, 1.0]
    ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1],
                       [.1, .9, 1], [0.9, 0.1, 1]])
    alphas = [.1, .01, 1.0]
    l1l2tv = [
        np.array([[float(1 - tv), float(1 - tv), tv]]) * ratios
        for tv in tv_range
    ]
    l1l2tv = np.concatenate(l1l2tv)
    alphal1l2tv = np.concatenate([
        np.c_[np.array([[alpha]] * l1l2tv.shape[0]), l1l2tv]
        for alpha in alphas
    ])
    # remove duplicates (all 5 ratio rows collapse to [0, 0, 1] at tv=1.0)
    alphal1l2tv = pd.DataFrame(alphal1l2tv)
    alphal1l2tv = alphal1l2tv[~alphal1l2tv.duplicated()]
    # Fix: was a bare no-op comparison; make the intended sanity check real.
    # 11 tv x 5 ratios = 55, minus 4 duplicates at tv=1.0 -> 51, x 3 alphas.
    assert alphal1l2tv.shape == (153, 4)
    # Remove too large l1 leading to a null soulution
    scaler = preprocessing.StandardScaler().fit(X)
    Xs = scaler.transform(X)
    l1max = utils.penalties.l1_max_logistic_loss(
        Xs[:, penalty_start:], y, mean=True, class_weight="auto")
    # 0.11406144805642522
    alphal1l2tv = alphal1l2tv[alphal1l2tv[0] * alphal1l2tv[1] <= l1max]
    params = [np.round(row, 5).tolist() for row in alphal1l2tv.values.tolist()]
    assert pd.DataFrame(params).duplicated().sum() == 0
    assert len(params) == 127
    print("NB run=", len(params) * len(cv))  # 4743 => 4216
    user_func_filename = "/home/ad247405/git/scripts/2017_asd_charles/03_enettv.py"
    # Paths in the config are relative to WD (files were copied above).
    config = dict(data=dict(X=os.path.basename(INPUT_DATA_X),
                            y=os.path.basename(INPUT_DATA_y)),
                  params=params, resample=cv,
                  structure=os.path.basename(INPUT_MASK_PATH),
                  structure_linear_operator_tv="Atv.npz",
                  map_output="model_selectionCV",
                  user_func=user_func_filename,
                  reduce_input="results/*/*",
                  reduce_group_by="params",
                  reduce_output="model_selectionCV.csv")
    json.dump(config, open(os.path.join(WD, "config_dCV.json"), "w"))
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, _ = \
        clust_utils.gabriel_make_sync_data_files(WD, wd_cluster=WD_CLUSTER)
    cmd = "mapreduce.py --map %s/config_dCV.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="10000:00:00")
# NOTE(review): fragment — the first three statements are the tail of
# create_config(...); its def lies outside this chunk.
    config_full_filename = os.path.join(full_output_dir, filename)
    json.dump(config, open(config_full_filename, "w"))
    return config


#################
# Actual script #
#################
# Builds the 5-fold config under OUTPUT_DIR/5_folds_all30yo_scz and the
# cluster sync/PBS helper files.  y is a dummy all-ones vector (unsupervised
# setting — presumably only X is used by the map function; TODO confirm).
if __name__ == "__main__":
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    #Retreive variables
    X = np.load(INPUT_DATA_X)
    y = np.ones(X.shape[0])
    shutil.copy(INPUT_DATA_X, os.path.join(OUTPUT_DIR, "5_folds_all30yo_scz"))
    shutil.copy(INPUT_MASK, os.path.join(OUTPUT_DIR, "5_folds_all30yo_scz"))
    #############################################################################
    # Create config files
    config_5folds = create_config(y, *(CONFIGS[0]))
    DEBUG = False
    if DEBUG:
        run_test(OUTPUT_DIR, config_5folds)
    # Build utils files: sync (push/pull) and PBS
    sync_push_filename, sync_pull_filename, WD_CLUSTER = \
        clust_utils.gabriel_make_sync_data_files(
            os.path.join(OUTPUT_DIR, "5_folds_all30yo_scz"))
    cmd = "mapreduce.py --map %s/config.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(
        os.path.join(OUTPUT_DIR, "5_folds_all30yo_scz"), cmd,
        walltime = "250:00:00")
# Write the mapreduce config (note: targets X and z, no y here), create the
# cluster sync/PBS files, push the data, then print (Python 2) the manual
# map/reduce commands for the Gabriel cluster.
config = dict(data=dict(X='X.npy', z='z.npy'),
              params=params, resample=cv,
              structure="",  # no spatial structure file for this run
              map_output="results",
              user_func=user_func_filename,
              reduce_input="results/*/*",
              reduce_group_by="results/.*/(.*)",  # regex: group by job subdir name
              reduce_output="results-2.csv")
json.dump(config, open(os.path.join(WD, "config.json"), "w"))
#########################################################################
# Build utils files: sync (push/pull) and PBS
sys.path.append(os.path.join(os.getenv('HOME'), 'gits', 'scripts'))
import brainomics.cluster_gabriel as clust_utils
sync_push_filename, sync_pull_filename, WD_CLUSTER = \
    clust_utils.gabriel_make_sync_data_files(WD, user="******")
cmd = "mapreduce.py -m %s/config.json --ncore 12" % WD_CLUSTER
clust_utils.gabriel_make_qsub_job_files(WD, cmd)
#########################################################################
# Synchronize to cluster (runs the generated push script)
print "Sync data to gabriel.intra.cea.fr: "
os.system(sync_push_filename)
#########################################################################
print "# Map"
print "mapreduce.py -m %s/config.json --ncore 12" % WD
#os.system("mapreduce.py --mode map --config %s/config.json" % WD)
print "# Run on cluster Gabriel"
print "qsub job_Global_long.pbs"
#########################################################################
print "# Reduce"
print "mapreduce.py -r %s/config.json" % WD_CLUSTER
# Actual script # ################# if __name__ == "__main__": if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) #Retreive variables X = np.load(INPUT_DATA_X) y = np.ones(X.shape[0]) shutil.copy(INPUT_DATA_X, os.path.join(OUTPUT_DIR, "vbm_pcatv_all+VIP_controls_30yo")) shutil.copy(INPUT_MASK, os.path.join(OUTPUT_DIR, "vbm_pcatv_all+VIP_controls_30yo")) ############################################################################# # Create config files config_5folds = create_config(y, *(CONFIGS[0])) DEBUG = False if DEBUG: run_test(OUTPUT_DIR, config_5folds) # Build utils files: sync (push/pull) and PBS sync_push_filename, sync_pull_filename, WD_CLUSTER = \ clust_utils.gabriel_make_sync_data_files(os.path.join(OUTPUT_DIR,"vbm_pcatv_all+VIP_controls_30yo")) cmd = "mapreduce.py --map %s/config.json" % WD_CLUSTER clust_utils.gabriel_make_qsub_job_files(os.path.join( OUTPUT_DIR, "vbm_pcatv_all+VIP_controls_30yo"), cmd, walltime="250:00:00")
def init():
    """Build the model-selection CV config for MCIc-CTL (Python 2 script).

    Reuses the resampling from an existing config.json when present, runs QC
    assertions on the folds, builds the (alpha, l1, l2, tv, k) grid, writes
    config_modselectcv.json plus the cluster sync/PBS files, pushes the data
    and prints the manual cluster commands.  Relies on module-level WD,
    config_filenane() and StratifiedKFold.
    """
    INPUT_DATA_X = os.path.join('X.npy')
    INPUT_DATA_y = os.path.join('y.npy')
    INPUT_MASK_PATH = os.path.join("mask.nii")
    NFOLDS_INNER, NFOLDS_OUTER = 5, 5
    #WD = os.path.join(WD, 'logistictvenet_5cv')
    if not os.path.exists(WD):
        os.makedirs(WD)
    os.chdir(WD)
    #############################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)
    X = np.load(INPUT_DATA_X)
    from parsimony.utils.penalties import l1_max_logistic_loss
    # Sanity check that the data is the expected dataset (columns 0-1 are
    # covariates, hence X[:, 2:]).
    assert l1_max_logistic_loss(X[:, 2:], y) == 0.18046445850741652
    if os.path.exists(config_filenane()):
        # Reuse previous resampling for reproducibility across re-runs.
        old_conf = json.load(open(config_filenane()))
        cv = old_conf["resample"]
    else:
        cv_outer = [[tr, te] for tr, te in StratifiedKFold(
            y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
        """ cv_outer = [[np.array(tr), np.array(te)] for tr,te in
        json.load(open("/neurospin/brainomics/2013_adni/MCIc-CTL_cs_s/config.json",
        "r"))["resample"][1:]] """
        import collections
        cv = collections.OrderedDict()
        for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
            cv["cv%02d/refit" % cv_outer_i] = [tr_val, te]
            # Inner folds index into tr_val, hence the tr_val[...] remapping.
            cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                       n_folds=NFOLDS_INNER, random_state=42)
            for cv_inner_i, (tr, val) in enumerate(cv_inner):
                cv["cv%02d/cvnested%02d" % (cv_outer_i, cv_inner_i)] = \
                    [tr_val[tr], tr_val[val]]
        for k in cv:
            cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]
    print cv.keys()
    # Some QC: folds are disjoint, stratification preserved, sizes as expected
    N = float(len(y))
    p0 = np.sum(y == 0) / N
    p1 = np.sum(y == 1) / N
    for k in cv:
        tr, val = cv[k]
        tr, val = np.array(tr), np.array(val)
        print k, "\t: tr+val=", len(tr) + len(val)
        assert not set(tr).intersection(val)
        assert abs(np.sum(y[tr] == 0) / float(len(y[tr])) - p0) < 0.01
        assert abs(np.sum(y[tr] == 1) / float(len(y[tr])) - p1) < 0.01
        if k.count("refit"):
            te = val
            assert len(tr) + len(te) == len(y)
            assert abs(len(y[tr]) / N - (1 - 1. / NFOLDS_OUTER)) < 0.01
        else:
            te = np.array(cv[k.split("/")[0] + "/refit"])[1]
            assert abs(
                len(y[tr]) / N - (1 - 1. / NFOLDS_OUTER) * (1 - 1.
                                                            / NFOLDS_INNER)) < 0.01
            assert not set(tr).intersection(te)
            assert not set(val).intersection(te)
            # NOTE(review): bare comparison below is a no-op — presumably it
            # was meant to be an assert; confirm before changing behavior.
            len(tr) + len(val) + len(te) == len(y)
    # Parameter grid: rows are (alpha, l1, l2, tv) plus trailing k (feature
    # screening size; -1 = no screening).
    tv_ratios = [0., .2, .8]
    l1_ratios = [np.array([1., .1, .9, 1]), np.array([1., .9, .1, 1])]
    # [alpha, l1 l2 tv]
    alphas_l1l2tv = [.01, .1]
    alphas_l2tv = [round(alpha, 10) for alpha in 10.**np.arange(-2, 4)]
    k_range = [-1]
    l1l2tv = [
        np.array([alpha, float(1 - tv), float(1 - tv), tv]) * l1_ratio
        for alpha in alphas_l1l2tv for tv in tv_ratios for l1_ratio in l1_ratios
    ]
    # specific case for without l1 since it supports larger penalties
    l2tv = [
        np.array([alpha, 0., float(1 - tv), tv])
        for alpha in alphas_l2tv for tv in tv_ratios
    ]
    params = l1l2tv + l2tv
    params = [param.tolist() + [k] for k in k_range for param in params]
    # Keyed dict so the reducer can refer to parameter sets by name.
    params = {"_".join([str(p) for p in param]): param for param in params}
    #assert len(params) == 30
    user_func_filename = os.path.join(os.environ["HOME"], "git", "scripts",
                                      "2013_adni", "MCIc-CTL",
                                      "02_tvenet_modselectcv_cs_s.py")
    #print __file__, os.path.abspath(__file__)
    print "user_func", user_func_filename
    #import sys
    #sys.exit(0)
    # Use relative path from config.json
    config = dict(
        data=dict(X=INPUT_DATA_X, y=INPUT_DATA_y),
        params=params, resample=cv,
        mask_filename=INPUT_MASK_PATH,
        penalty_start=2,  # first 2 columns are covariates, not penalized
        map_output="modselectcv",
        user_func=user_func_filename,
        #reduce_input="rndperm/*/*",
        reduce_group_by="user_defined",
        reduce_output="MCIc-CTL_cs_s_modselectcv.csv")
    json.dump(config, open(os.path.join(WD, "config_modselectcv.json"), "w"))
    #############################################################################
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, WD_CLUSTER = \
        clust_utils.gabriel_make_sync_data_files(WD)
    cmd = "mapreduce.py --map %s/config_modselectcv.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd)
    #############################################################################
    # Sync to cluster (runs the generated push script)
    print "Sync data to gabriel.intra.cea.fr: "
    os.system(sync_push_filename)
    #############################################################################
    # Operator instructions only.
    print "# Start by running Locally with 2 cores, to check that everything os OK)"
    print "Interrupt after a while CTL-C"
    print "mapreduce.py --map %s/config_modselectcv.json --ncore 2" % WD
    #os.system("mapreduce.py --mode map --config %s/config.json" % WD)
    print "# 1) Log on gabriel:"
    print 'ssh -t gabriel.intra.cea.fr'
    print "# 2) Run one Job to test"
    print "qsub -I"
    print "cd %s" % WD_CLUSTER
    print "./job_Global_long.pbs"
    print "# 3) Run on cluster"
    print "qsub job_Global_long.pbs"
    print "# 4) Log out and pull Pull"
    print "exit"
    print sync_pull_filename
    #############################################################################
    print "# Reduce"
    print "mapreduce.py --reduce %s/config_modselectcv.json" % WD
def init():
    """Build the model-selection CV config for enet-GraphNet on NUSDAST VBM.

    Copies inputs into WD, builds outer/inner stratified-CV folds (with a
    leading "refit" pseudo-fold on all samples), builds the
    (alpha, l1, l2, gn) grid, writes config_dCV.json and the cluster
    sync/PBS files.  Relies on module-level WD, WD_CLUSTER, NFOLDS_OUTER,
    NFOLDS_INNER.
    """
    INPUT_DATA_X = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/X.npy'
    INPUT_DATA_y = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/y.npy'
    INPUT_MASK_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/mask.nii'
    INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/Atv.npz'
    INPUT_CSV = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/population_30yo.csv'
    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_LINEAR_OPE_PATH, WD)
    ## Create config file
    y = np.load(INPUT_DATA_y)
    X = np.load(INPUT_DATA_X)
    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    if cv_outer[0] is not None:  # Make sure first fold is None
        # Prepend a "refit" pseudo-fold: train and test on all samples.
        cv_outer.insert(0, None)
        null_resampling = list()
        null_resampling.append(np.arange(0, len(y))), null_resampling.append(
            np.arange(0, len(y)))
        cv_outer[0] = null_resampling
    import collections
    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        if cv_outer_i == 0:
            cv["refit/refit"] = [tr_val, te]
        else:
            cv["cv%02d/refit" % (cv_outer_i - 1)] = [tr_val, te]
            # Inner folds index into tr_val, hence the tr_val[...] remapping.
            cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                       n_folds=NFOLDS_INNER, random_state=42)
            for cv_inner_i, (tr, val) in enumerate(cv_inner):
                cv["cv%02d/cvnested%02d" % ((cv_outer_i - 1), cv_inner_i)] = \
                    [tr_val[tr], tr_val[val]]
    for k in cv:
        cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]  # JSON-serializable
    print(list(cv.keys()))
    #grid of ols paper
    gn_range = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    # ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1], [.1, .9, 1],
    #                    [0.9, 0.1, 1]])
    # gn_range = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    # gn_range = [0.0, 0.2, 0.8, 1.0]
    ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1], [.1, .90, 1],
                       [0.9, 0.1, 1], [0.2, 0.8, 1], [0.3, 0.7, 1]])
    # ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1], [.1, .9, 1],
    #                    [0.9, 0.1, 1]])
    alphas = [.1, .01, 1.0]
    # Rows are (l1, l2, gn) weighted by the GraphNet fraction gn.
    l1l2s = [
        np.array([[float(1 - gn), float(1 - gn), gn]]) * ratios
        for gn in gn_range
    ]
    l1l2s = np.concatenate(l1l2s)
    alphal1l2s = np.concatenate([
        np.c_[np.array([[alpha]] * l1l2s.shape[0]), l1l2s]
        for alpha in alphas
    ])
    # NOTE(review): unlike the sibling enet-TV script, duplicate grid rows
    # (all ratio rows collapse at gn=1.0) are NOT removed here — confirm
    # whether the redundant runs are intended.
    params = [np.round(params, 2).tolist() for params in alphal1l2s]
    print("NB run=", len(params) * len(cv))
    user_func_filename = "/home/ed203246/git/scripts/2016_schizConnect/supervised_analysis/NUSDAST/VBM/30yo_scripts/02_enetgn_NUDAST.py"
    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure="mask.nii",
                  structure_linear_operator_tv="Atv.npz",
                  map_output="model_selectionCV",
                  user_func=user_func_filename,
                  reduce_input="results/*/*",
                  reduce_group_by="params",
                  reduce_output="model_selectionCV.csv")
    json.dump(config, open(os.path.join(WD, "config_dCV.json"), "w"))
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, _ = \
        clust_utils.gabriel_make_sync_data_files(WD, wd_cluster=WD_CLUSTER)
    cmd = "mapreduce.py --map %s/config_dCV.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="250:00:00")
# Actual script # ################# if __name__ == "__main__": if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) #Retreive variables X = np.load(INPUT_DATA_X) y = np.ones(X.shape[0]) shutil.copy(INPUT_DATA_X, os.path.join(OUTPUT_DIR, "FS_pcatv_NMoprhCH_controls")) shutil.copy(INPUT_MASK, os.path.join(OUTPUT_DIR, "FS_pcatv_NMoprhCH_controls")) ############################################################################# # Create config files config_5folds = create_config(y, *(CONFIGS[0])) DEBUG = False if DEBUG: run_test(OUTPUT_DIR, config_5folds) # Build utils files: sync (push/pull) and PBS sync_push_filename, sync_pull_filename, WD_CLUSTER = \ clust_utils.gabriel_make_sync_data_files(os.path.join(OUTPUT_DIR,"FS_pcatv_NMoprhCH_controls")) cmd = "mapreduce.py --map %s/config.json" % WD_CLUSTER clust_utils.gabriel_make_qsub_job_files(os.path.join( OUTPUT_DIR, "FS_pcatv_NMoprhCH_controls"), cmd, walltime="250:00:00")
os.makedirs(output_dir) # Copy the learning data src_datafile = os.path.join(input_dir, INPUT_STD_DATASET_FILE) shutil.copy(src_datafile, output_dir) # Copy the objects masks for i in range(N_COMP): filename = INPUT_OBJECT_MASK_FILE_FORMAT.format(o=i) src_filename = os.path.join(INPUT_MASK_DIR, filename) dst_filename = os.path.join(output_dir, filename) shutil.copy(src_filename, dst_filename) # Create files to synchronize with the cluster sync_push_filename, sync_pull_filename, CLUSTER_WD = \ clust_utils.gabriel_make_sync_data_files(output_dir, user="******") # Create config file user_func_filename = os.path.abspath(__file__) config = OrderedDict([('data', dict(X=INPUT_STD_DATASET_FILE)), ('im_shape', dice5_data.SHAPE), ('params', correct_params), ('l1_max', l1_max), ('n_comp', N_COMP), ('resample', resamplings), ('map_output', "results"), ('user_func', user_func_filename), ('ncore', 4), ('reduce_group_by', "params"), ('reduce_output', "results.csv")]) config_full_filename = os.path.join(output_dir, "config.json") json.dump(config, open(config_full_filename, "w"), **JSON_DUMP_OPT)
def init():
    """Build the no-warm-restart CV config on covariate-stripped NUSDAST VBM.

    Saves X with the first `penalty_start` covariate columns removed, builds
    outer/inner stratified-CV folds (leading "all/all" pseudo-fold on every
    sample), writes config_dCV.json with a fixed 8-row parameter list and
    the cluster sync/PBS files.  Relies on module-level WD, WD_CLUSTER and
    penalty_start.
    """
    INPUT_DATA_X = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/X.npy'  # remove covariates from data
    INPUT_DATA_y = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/y.npy'
    INPUT_MASK_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/mask.nii'
    INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/Atv.npz'
    NFOLDS_OUTER = 5
    NFOLDS_INNER = 5
    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_LINEAR_OPE_PATH, WD)
    X = np.load(INPUT_DATA_X)
    # Drop the covariate columns before saving the working copy of X.
    np.save(os.path.join(WD, "X.npy"), X[:, penalty_start:])
    #start_vector=weights.RandomUniformWeights(normalise=True,seed= 40004)
    #np.save(os.path.join(WD,"start_vector.npy"),start_vector)
    # NOTE(review): the config below references "start_vector.npy" but its
    # creation is commented out above — confirm the file exists in WD or
    # that the user_func tolerates its absence.
    y = np.load(INPUT_DATA_y)
    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    if cv_outer[0] is not None:  # Make sure first fold is None
        # Prepend an "all/all" pseudo-fold: train and test on all samples.
        cv_outer.insert(0, None)
        null_resampling = list()
        null_resampling.append(np.arange(0, len(y))), null_resampling.append(
            np.arange(0, len(y)))
        cv_outer[0] = null_resampling
    # import collections
    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        if cv_outer_i == 0:
            cv["all/all"] = [tr_val, te]
        else:
            cv["cv%02d/all" % (cv_outer_i - 1)] = [tr_val, te]
            # Inner folds index into tr_val, hence the tr_val[...] remapping.
            cv_inner = StratifiedKFold(y[tr_val].ravel(),
                                       n_folds=NFOLDS_INNER, random_state=42)
            for cv_inner_i, (tr, val) in enumerate(cv_inner):
                cv["cv%02d/cvnested%02d" % ((cv_outer_i - 1), cv_inner_i)] = \
                    [tr_val[tr], tr_val[val]]
    for k in cv:
        cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]  # JSON-serializable
    print(list(cv.keys()))
    # Fixed parameter list: rows are (alpha, l1, l2, tv).
    params = [[0.01, 0.72, 0.08, 0.2], [0.01, 0.08, 0.72, 0.2],
              [0.01, 0.18, 0.02, 0.8], [0.1, 0.18, 0.02, 0.8],
              [0.1, 0.02, 0.18, 0.8], [0.01, 0.02, 0.18, 0.8],
              [0.1, 0.08, 0.72, 0.2], [0.1, 0.72, 0.08, 0.2]]
    assert len(params) == 8
    user_func_filename = "/home/ad247405/git/scripts/2017_parsimony_settings/warm_start/no_covariates/random_start/no_warm_restart_NUDAST_30yo_VBM.py"
    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure="mask.nii",
                  start_vector=dict(start_vector="start_vector.npy"),
                  structure_linear_operator_tv="Atv.npz",
                  map_output="model_selectionCV",
                  user_func=user_func_filename,
                  reduce_input="results/*/*",
                  reduce_group_by="params",
                  reduce_output="model_selectionCV.csv")
    json.dump(config, open(os.path.join(WD, "config_dCV.json"), "w"))
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, _ = \
        clust_utils.gabriel_make_sync_data_files(WD, wd_cluster=WD_CLUSTER)
    cmd = "mapreduce.py --map %s/config_dCV.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="2500:00:00")
def init():
    """Build the warm-restart CV config using cv00 betas as start vectors.

    Gathers per-parameter beta.npz arrays produced by a previous
    no-warm-restart run into WD/beta_start.npz (keyed by parameter-dir
    name), builds outer stratified-CV folds only, and writes config_dCV.json
    plus the cluster sync/PBS files.  Relies on module-level WD, WD_CLUSTER.
    """
    INPUT_DATA_X = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/X.npy'
    INPUT_DATA_y = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/y.npy'
    INPUT_MASK_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/mask.nii'
    INPUT_LINEAR_OPE_PATH = '/neurospin/brainomics/2016_schizConnect/analysis/NUSDAST/VBM/data/data_30yo/Atv.npz'
    NFOLDS_OUTER = 5
    os.makedirs(WD, exist_ok=True)
    shutil.copy(INPUT_DATA_X, WD)
    shutil.copy(INPUT_DATA_y, WD)
    shutil.copy(INPUT_MASK_PATH, WD)
    shutil.copy(INPUT_LINEAR_OPE_PATH, WD)
    if not os.path.exists(os.path.join(WD, "beta_start.npz")):
        # Build beta_start.npz once; subsequent runs reuse the cached file.
        betas = dict()
        BETA_START_PATH = "/neurospin/brainomics/2017_parsimony_settings/warm_restart/NUSDAST_30yo/VBM/no_warm_restart/model_selectionCV/cv00/all"
        # One subdirectory per parameter set (names starting with "0").
        params = glob.glob(os.path.join(BETA_START_PATH, "0*"))
        for p in params:
            print(p)
            path = os.path.join(p, "beta.npz")
            beta = np.load(path)
            betas[os.path.basename(p)] = beta['arr_0']
        np.savez(os.path.join(WD, "beta_start.npz"), **betas)
        # Round-trip check: what was written matches what was collected.
        # NOTE(review): this validation only runs when the npz is (re)built.
        beta_start = np.load(os.path.join(WD, "beta_start.npz"))
        assert np.all(
            [np.all(beta_start[a] == betas[a]) for a in beta_start.keys()])
    y = np.load(INPUT_DATA_y)
    # Outer folds only — no nested inner CV in this warm-restart setting.
    cv_outer = [[tr, te] for tr, te in StratifiedKFold(
        y.ravel(), n_folds=NFOLDS_OUTER, random_state=42)]
    # import collections
    cv = collections.OrderedDict()
    for cv_outer_i, (tr_val, te) in enumerate(cv_outer):
        cv["cv%02d/all" % (cv_outer_i)] = [tr_val, te]
    for k in cv:
        cv[k] = [cv[k][0].tolist(), cv[k][1].tolist()]  # JSON-serializable
    print(list(cv.keys()))
    # Fixed parameter list: rows are (alpha, l1, l2, tv); must match the
    # parameter-set directory names collected into beta_start.npz above.
    params = [[0.01, 0.72, 0.08, 0.2], [0.01, 0.08, 0.72, 0.2],
              [0.01, 0.18, 0.02, 0.8], [0.1, 0.18, 0.02, 0.8],
              [0.1, 0.02, 0.18, 0.8], [0.01, 0.02, 0.18, 0.8],
              [0.1, 0.08, 0.72, 0.2], [0.1, 0.72, 0.08, 0.2]]
    assert len(params) == 8
    user_func_filename = "/home/ad247405/git/scripts/2017_parsimony_settings/warm_restart/NUDAST_30yo_VBM_cv00_all_as_start_vector.py"
    config = dict(data=dict(X="X.npy", y="y.npy"),
                  params=params, resample=cv,
                  structure="mask.nii",
                  beta_start="beta_start.npz",
                  structure_linear_operator_tv="Atv.npz",
                  map_output="model_selectionCV",
                  user_func=user_func_filename,
                  reduce_input="results/*/*",
                  reduce_group_by="params",
                  reduce_output="model_selectionCV.csv")
    json.dump(config, open(os.path.join(WD, "config_dCV.json"), "w"))
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, _ = \
        clust_utils.gabriel_make_sync_data_files(WD, wd_cluster=WD_CLUSTER)
    cmd = "mapreduce.py --map %s/config_dCV.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd, walltime="2500:00:00")
################# # Actual script # ################# if __name__ == "__main__": if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) #Retreive variables X = np.load(INPUT_DATA_X) y = np.ones(X.shape[0]) shutil.copy(INPUT_DATA_X, os.path.join(OUTPUT_DIR, "5_folds_NMoprhCH_all")) shutil.copy(INPUT_MASK, os.path.join(OUTPUT_DIR, "5_folds_NMoprhCH_all")) ############################################################################# # Create config files config_5folds = create_config(y, *(CONFIGS[0])) DEBUG = False if DEBUG: run_test(OUTPUT_DIR, config_5folds) # Build utils files: sync (push/pull) and PBS sync_push_filename, sync_pull_filename, WD_CLUSTER = \ clust_utils.gabriel_make_sync_data_files(os.path.join(OUTPUT_DIR,"5_folds_NMoprhCH_all")) cmd = "mapreduce.py --map %s/config.json" % WD_CLUSTER clust_utils.gabriel_make_qsub_job_files(os.path.join( OUTPUT_DIR, "5_folds_NMoprhCH_all"), cmd, walltime="250:00:00")
################# # Actual script # ################# if __name__ == "__main__": if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) #Retreive variables X = np.load(INPUT_DATA_X) y = np.ones(X.shape[0]) shutil.copy(INPUT_DATA_X, os.path.join(OUTPUT_DIR, "5_folds_NUDAST_10comp")) shutil.copy(INPUT_MASK, os.path.join(OUTPUT_DIR, "5_folds_NUDAST_10comp")) ############################################################################# # Create config files config_5folds = create_config(y, *(CONFIGS[0])) DEBUG = False if DEBUG: run_test(OUTPUT_DIR, config_5folds) # Build utils files: sync (push/pull) and PBS sync_push_filename, sync_pull_filename, WD_CLUSTER = \ clust_utils.gabriel_make_sync_data_files(os.path.join(OUTPUT_DIR,"5_folds_NUDAST_10comp")) cmd = "mapreduce.py --map %s/config.json" % WD_CLUSTER clust_utils.gabriel_make_qsub_job_files(os.path.join( OUTPUT_DIR, "5_folds_NUDAST_10comp"), cmd, walltime="250:00:00")
def init():
    """Build the 5-fold CV config for MCIc-CTL enet-TV (Python 2 script).

    Reuses the resampling from an existing config.json when present
    (otherwise prepends a None "refit" marker fold), builds the
    (alpha, l1, l2, tv, k) grid, writes config.json plus the cluster
    sync/PBS files, pushes the data and prints the manual cluster commands.
    Relies on module-level WD, NFOLDS_OUTER and results_filenane().
    """
    os.chdir(WD)
    INPUT_DATA_X = os.path.join('X.npy')
    INPUT_DATA_y = os.path.join('y.npy')
    INPUT_MASK_PATH = os.path.join("mask.nii")
    #WD = os.path.join(WD, 'logistictvenet_5cv')
    if not os.path.exists(WD):
        os.makedirs(WD)
    os.chdir(WD)
    #############################################################################
    ## Create config file
    y = np.load(INPUT_DATA_y)
    if os.path.exists("config.json"):
        # Reuse previous resampling for reproducibility across re-runs.
        inf = open("config.json", "r")
        old_conf = json.load(inf)
        cv = old_conf["resample"]
        inf.close()
    else:
        cv = [[tr.tolist(), te.tolist()]
              for tr, te in StratifiedKFold(y.ravel(), n_folds=NFOLDS_OUTER)]
        if cv[0] is not None:  # Make sure first fold is None
            cv.insert(0, None)
    # parameters grid
    # Re-run with
    tv_range = np.hstack([np.arange(0, 1., .1), [0.05, 0.01, 0.005, 0.001]])
    ratios = np.array([[1., 0., 1], [0., 1., 1], [.5, .5, 1], [.9, .1, 1],
                       [.1, .9, 1], [.01, .99, 1], [.001, .999, 1]])
    alphas = [.01, .05, .1, .5, 1.]
    # k: feature screening size; -1 = no screening.
    k_range = [100, 1000, 10000, 100000, -1]
    l1l2tv = [
        np.array([[float(1 - tv), float(1 - tv), tv]]) * ratios
        for tv in tv_range
    ]
    l1l2tv.append(np.array([[0., 0., 1.]]))  # pure-TV corner case
    l1l2tv = np.concatenate(l1l2tv)
    alphal1l2tv = np.concatenate([
        np.c_[np.array([[alpha]] * l1l2tv.shape[0]), l1l2tv]
        for alpha in alphas
    ])
    alphal1l2tvk = np.concatenate([
        np.c_[alphal1l2tv, np.array([[k]] * alphal1l2tv.shape[0])]
        for k in k_range
    ])
    params = [params.tolist() for params in alphal1l2tvk]
    """
    inf = open("config.json", "w")
    old_conf = json.load(inf)
    params = old_conf["params"]
    params.append([.1, .05, .6, .35, -1.0])
    params.append([.05, .05, .6, .35, -1.0])
    params.append([.01, .05, .6, .35, -1.0])
    """
    # User map/reduce function file:
    # try:
    #     user_func_filename = os.path.abspath(__file__)
    # except:
    user_func_filename = os.path.join(os.environ["HOME"], "git", "scripts",
                                      "2013_adni", "MCIc-CTL",
                                      "02_tvenet_csi.py")
    #print __file__, os.path.abspath(__file__)
    print "user_func", user_func_filename
    #import sys
    #sys.exit(0)
    # Use relative path from config.json
    config = dict(
        data=dict(X=INPUT_DATA_X, y=INPUT_DATA_y),
        params=params, resample=cv,
        mask_filename=INPUT_MASK_PATH,
        penalty_start=3,  # first 3 columns are covariates, not penalized
        map_output="5cv",
        user_func=user_func_filename,
        #reduce_input="rndperm/*/*",
        reduce_group_by="params",
        reduce_output=os.path.basename(results_filenane()).replace(
            "xlsx", "csv"))
    json.dump(config, open(os.path.join(WD, "config.json"), "w"))
    #############################################################################
    # Build utils files: sync (push/pull) and PBS
    import brainomics.cluster_gabriel as clust_utils
    sync_push_filename, sync_pull_filename, WD_CLUSTER = \
        clust_utils.gabriel_make_sync_data_files(WD)
    cmd = "mapreduce.py --map %s/config.json" % WD_CLUSTER
    clust_utils.gabriel_make_qsub_job_files(WD, cmd)
    #############################################################################
    # Sync to cluster (runs the generated push script)
    print "Sync data to gabriel.intra.cea.fr: "
    os.system(sync_push_filename)
    #############################################################################
    # Operator instructions only.
    print "# Start by running Locally with 2 cores, to check that everything os OK)"
    print "Interrupt after a while CTL-C"
    print "mapreduce.py --map %s/config.json --ncore 2" % WD
    #os.system("mapreduce.py --mode map --config %s/config.json" % WD)
    print "# 1) Log on gabriel:"
    print 'ssh -t gabriel.intra.cea.fr'
    print "# 2) Run one Job to test"
    print "qsub -I"
    print "cd %s" % WD_CLUSTER
    print "./job_Global_long.pbs"
    print "# 3) Run on cluster"
    print "qsub job_Global_long.pbs"
    print "# 4) Log out and pull Pull"
    print "exit"
    print sync_pull_filename
    #############################################################################
    print "# Reduce"
    print "mapreduce.py --reduce %s/config.json" % WD