def load_data(cache):
    """Build, or reload from cache, the (X, z) arrays.

    Parameters
    ----------
    cache : bool
        Falsy: rebuild X and z from the raw CSV/HDF5 inputs and save them as
        ``X.npy`` / ``z.npy`` under SHARED_DIR.
        Truthy: load those two files instead of rebuilding.

    Returns
    -------
    X : numpy array
        Design-matrix columns (subject-id column removed) followed by the
        masked image columns.
    z : numpy array
        Content of ``BMI.csv``.
    """
    if not cache:
        BMI = pd.read_csv(os.path.join(DATA_PATH, "BMI.csv"),
                          index_col=0).as_matrix()

        # Clinical dataframe restricted to the covariates of interest
        COFOUND = [
            "Subject",
            "Gender de Feuil2",
            "ImagingCentreCity",
            "tiv_gaser",
            "mean_pds",
        ]
        df = pd.read_csv(
            os.path.join(CLINIC_DATA_PATH, "1534bmi-vincent2.csv"),
            index_col=0)
        df = df[COFOUND]

        # Conversion dummy coding
        design_mat = utils.make_design_matrix(
            df, regressors=COFOUND).as_matrix()

        # Keep only the rows whose subject id (first column) is listed in
        # subjects_id.csv, then drop that id column.  A boolean mask is used
        # instead of the former nested np.delete/np.in1d construct, which
        # tested ids against *every* value of the excluded rows and could
        # therefore discard a valid subject whose id happened to equal some
        # covariate value.
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH, "subjects_id.csv"),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)
        keep = np.in1d(design_mat[:, 0], subjects_id)
        design_mat = design_mat[keep, 1:]

        # Images (already masked).  The file stays open until the arrays are
        # saved, as before, but is now closed even if a step fails.
        h5file = tables.openFile(IMAGES_FILE)
        try:
            masked_images = bmi_utils.read_array(
                h5file,
                "/standard_mask/residualized_images_gender_center_TIV_pds")
            print("Data loaded")

            # Concatenate images with covariates (gender, imaging city
            # centre, tiv_gaser and mean pds status) in order to do as
            # though BMI had been residualized
            X = np.concatenate((design_mat, masked_images), axis=1)
            z = BMI
            np.save(os.path.join(SHARED_DIR, "X.npy"), X)
            np.save(os.path.join(SHARED_DIR, "z.npy"), z)
        finally:
            h5file.close()
        print("Data saved")
    else:
        X = np.load(os.path.join(SHARED_DIR, "X.npy"))
        z = np.load(os.path.join(SHARED_DIR, "z.npy"))
        print("Data read from cache")
    return X, z
def load_residualized_bmi_data(cache):
    """Return (X, Y): standardized covariates + BMI, and masked images.

    With a falsy ``cache`` both arrays are rebuilt from the raw inputs and
    written to ``X.npy`` / ``Y.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.
    """
    x_file = os.path.join(SHARED_DIR, 'X.npy')
    y_file = os.path.join(SHARED_DIR, 'Y.npy')

    # Fast path: nothing to compute, just reload the saved arrays
    if cache:
        X = np.load(x_file)
        Y = np.load(y_file)
        print("Data read from cache")
        return X, Y

    # BMI values
    bmi_values = pd.io.parsers.read_csv(
        os.path.join(DATA_PATH, 'BMI.csv'), index_col=0).as_matrix()

    # Clinical table, reduced to the covariate columns
    confound_names = [
        'Gender de Feuil2',
        'ImagingCentreCity',
        'tiv_gaser',
        'mean_pds',
    ]
    population = pd.io.parsers.read_csv(
        os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)
    population = population[confound_names]

    # Rows are picked (and ordered) by the ids listed in subjects_id.csv
    subject_ids = np.genfromtxt(os.path.join(DATA_PATH, 'subjects_id.csv'),
                                dtype=None,
                                delimiter=',',
                                skip_header=1)
    selected = population.loc[subject_ids]

    # Dummy-coded covariates, with BMI appended as extra column(s)
    covariates = utils.make_design_matrix(
        selected, regressors=confound_names).as_matrix()
    stacked = np.hstack((covariates, bmi_values))

    # Images
    h5file = tables.openFile(IMAGES_FILE)
    Y = bmi_utils.read_array(
        h5file, '/standard_mask/residualized_images_gender_center_TIV_pds')
    print("Images loaded")

    # Center & scale the whole design matrix
    X = StandardScaler().fit_transform(stacked)

    np.save(x_file, X)
    np.save(y_file, Y)
    h5file.close()
    print("Data saved")
    return X, Y
def load_residualized_bmi_data(cache):
    """Return (X, z): covariates stacked with masked images, and BMI.

    With a falsy ``cache`` both arrays are rebuilt from the raw inputs and
    written to ``X.npy`` / ``z.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.
    """
    x_file = os.path.join(SHARED_DIR, 'X.npy')
    z_file = os.path.join(SHARED_DIR, 'z.npy')

    # Fast path: reload what a previous run saved
    if cache:
        X = np.load(x_file)
        z = np.load(z_file)
        print("Data read from cache")
        return X, z

    z = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                               index_col=0).as_matrix()

    # Clinical table, reduced to the covariate columns
    confound_names = [
        'Gender de Feuil2',
        'ImagingCentreCity',
        'tiv_gaser',
        'mean_pds',
    ]
    clinic_table = pd.io.parsers.read_csv(
        os.path.join(SHFJ_DATA_PATH, '1534bmi-vincent2.csv'), index_col=0)
    clinic_table = clinic_table[confound_names]

    # Rows are picked (and ordered) by the ids listed in subjects_id.csv
    subject_ids = np.genfromtxt(os.path.join(DATA_PATH, 'subjects_id.csv'),
                                dtype=None,
                                delimiter=',',
                                skip_header=1)
    selected = clinic_table.loc[subject_ids]

    # Dummy coding of the covariates
    covariates = utils.make_design_matrix(
        selected, regressors=confound_names).as_matrix()

    # Images that have already been masked
    h5file = tables.openFile(IMAGES_FILE)
    images = bmi_utils.read_array(
        h5file, '/standard_mask/residualized_images_gender_center_TIV_pds')
    print("Data loaded - Processing")

    # Covariates (gender, imaging city centre, tiv_gaser, mean pds status)
    # come first, image columns after, in order to do as though BMI had
    # been residualized
    X = np.hstack((covariates, images))

    np.save(x_file, X)
    np.save(z_file, z)
    h5file.close()
    print("Data saved")
    return X, z
def load_residualized_bmi_data(cache):
    """Build, or reload from cache, the (X_res, z) arrays.

    With a falsy ``cache`` the arrays are rebuilt from the raw inputs and
    saved as ``X_res.npy`` / ``z.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.

    NOTE(review): the rebuild branch is unfinished -- the row selector of
    ``images_file`` is still a ``???????????`` placeholder, which is not
    valid Python.  The intended selection cannot be determined from this
    function and must be supplied by the author.
    """
    if not(cache):
        # Disabled SNPs loading, kept from the original:
        # SNPs = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'SNPs.csv'),
        #                               dtype='float64',
        #                               index_col=0).as_matrix()
        BMI = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                     index_col=0).as_matrix()

        # Clinical dataframe, reduced to the covariate columns below
        df = pd.io.parsers.read_csv(os.path.join(CLINIC_DATA_PATH,
                                                 'normal_group.csv'),
                                    index_col=0)
        COFOUND = ['Gender de Feuil2', 'ImagingCentreCity', 'tiv_gaser',
                   'mean_pds']
        df = df[COFOUND]

        # Conversion dummy coding
        covar = utils.make_design_matrix(df, regressors=COFOUND).as_matrix()

        # Images (already masked)
        h5file = tables.openFile(IMAGES_FILE)
        images_file = bmi_utils.read_array(h5file,
            "/standard_mask/residualized_images_gender_center_TIV_pds")
        # NOTE(review): placeholder row index -- presumably meant to keep
        # only the image rows of the subjects in normal_group.csv; confirm
        # and replace before running.
        masked_images = images_file[???????????, :]
        print "Data loaded - Processing"

        # NOTE(review): BMI is taken from the full BMI.csv, not restricted
        # to the rows of df -- verify that z lines up with X_res.
        z = BMI

        # Concatenate images and covariates
        # (gender, imaging city centre, tiv_gaser and mean pds status)
        # in order to do as though BMI had been residualized.
        X_res = np.hstack((covar, masked_images))

        np.save(os.path.join(SHARED_DIR, "X_res.npy"), X_res)
        np.save(os.path.join(SHARED_DIR, "z.npy"), z)

        # Closed only after saving, in case the image array reads lazily
        # from the file -- TODO confirm
        h5file.close()

        print "Data saved"
    else:
        X_res = np.load(os.path.join(SHARED_DIR, "X_res.npy"))
        z = np.load(os.path.join(SHARED_DIR, "z.npy"))
        print "Data read from cache"
    return X_res, z
def load_residualized_bmi_data(cache):
    """Return (X, z): covariates + constant + sulci features, and BMI.

    With a falsy ``cache`` both arrays are rebuilt from the raw inputs and
    written to ``X.npy`` / ``z.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.
    """
    x_file = os.path.join(SHARED_DIR, 'X.npy')
    z_file = os.path.join(SHARED_DIR, 'z.npy')

    # Fast path: reload what a previous run saved
    if cache:
        X = np.load(x_file)
        z = np.load(z_file)
        print("Data read from cache")
        return X, z

    # BMI
    bmi_table = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                       sep=',',
                                       index_col=0)

    # Sulci features: the first CSV column is parsed separately with numpy
    # and installed as the dataframe index (subject ids in the right format).
    # NOTE(review): the CSV itself is read without index_col, so that first
    # column presumably also remains among the feature columns -- confirm
    # this is intended.
    sulci_csv = os.path.join(QC_PATH, 'sulci_df_qc.csv')
    id_column = np.genfromtxt(sulci_csv,
                              dtype=None,
                              delimiter=',',
                              skip_header=1,
                              usecols=0).tolist()
    sulci_table = pd.io.parsers.read_csv(sulci_csv, sep=',')
    sulci_table = sulci_table.set_index(pd.Index(id_column))

    # Clinical confounds of non interest; squared TIV is added as an extra
    # confound since sulci follows a power law
    clinical_table = pd.io.parsers.read_csv(
        os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)
    clinical_table['tiv2'] = clinical_table['tiv_gaser'] ** 2
    confound_names = [
        'Gender de Feuil2',
        'ImagingCentreCity',
        'tiv_gaser',
        'tiv2',
        'mean_pds',
    ]
    clinical_table = clinical_table[confound_names]

    # Subjects with neuroimaging and genetic data ...
    listed_ids = np.genfromtxt(os.path.join(DATA_PATH, 'subjects_id.csv'),
                               dtype=None,
                               delimiter=',',
                               skip_header=1)
    # ... intersected with the subjects that have sulci features
    common_ids = np.intersect1d(listed_ids, sulci_table.index.values)

    # Same subject selection applied to the three tables
    sulci_features = sulci_table.loc[common_ids]
    clinical_rows = clinical_table.loc[common_ids]
    bmi_rows = bmi_table.loc[common_ids]

    # Dummy coding; the last column (constant regressor, per the original
    # comment) is left out of the scaling
    coded = utils.make_design_matrix(
        clinical_rows, regressors=confound_names).as_matrix()
    scaler = StandardScaler()
    scaled_cov = scaler.fit_transform(coded[:, :-1])

    # Center & scale BMI, then the sulci features
    z = scaler.fit_transform(bmi_rows)
    sulci_features = scaler.fit_transform(sulci_features)
    print("Sulci_data loaded")

    # Column of ones to mimick the fit intercept
    ones_column = np.ones((sulci_features.shape[0], 1))

    # Covariates, constant regressor, then sulci features
    X = np.hstack((scaled_cov, ones_column, sulci_features))

    np.save(x_file, X)
    np.save(z_file, z)
    print("Data saved")
    return X, z
def load_SNPs_bmi_data(cache):
    """Return (X, z): covariates + constant + sulci max depth, and BMI.

    With a falsy ``cache`` both arrays are rebuilt from the raw inputs and
    written to ``X.npy`` / ``z.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.  The SNPs table is
    only used to restrict the set of subjects.
    """
    x_file = os.path.join(SHARED_DIR, 'X.npy')
    z_file = os.path.join(SHARED_DIR, 'z.npy')

    # Fast path: reload what a previous run saved
    if cache:
        X = np.load(x_file)
        z = np.load(z_file)
        print('Data read from cache.')
        return X, z

    # Input tables, all indexed by their first column
    bmi_table = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                       sep=',',
                                       index_col=0)
    depth_table = pd.io.parsers.read_csv(
        os.path.join(QC_PATH, 'sulci_depthMax_df.csv'),
        sep=',',
        index_col=0)
    snps_table = pd.io.parsers.read_csv(
        os.path.join(DATA_PATH, 'BMI_associated_SNPs_measures.csv'),
        index_col=0)
    clinical_table = pd.io.parsers.read_csv(
        os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)

    # Confounds of non interest
    confound_names = [
        'Gender de Feuil2',
        'ImagingCentreCity',
        'tiv_gaser',
        'mean_pds',
    ]
    clinical_table = clinical_table[confound_names]

    # Subjects present in the SNPs, BMI and sulci tables
    in_snps_and_bmi = np.intersect1d(snps_table.index.values,
                                     bmi_table.index.values)
    common_ids = np.intersect1d(in_snps_and_bmi, depth_table.index.values)

    # Same subject selection applied to the three tables
    clinical_rows = clinical_table.loc[common_ids]
    bmi_rows = bmi_table.loc[common_ids]
    sulci_rows = depth_table.loc[common_ids]

    # Dummy coding; the last column (constant regressor, per the original
    # comment) is left out of the scaling
    coded = utils.make_design_matrix(
        clinical_rows, regressors=confound_names).as_matrix()
    scaler = StandardScaler()
    scaled_cov = scaler.fit_transform(coded[:, :-1])

    # Center & scale BMI (the sulci data is used unscaled here)
    z = scaler.fit_transform(bmi_rows)

    # Column of ones to mimick the fit intercept
    ones_column = np.ones((sulci_rows.shape[0], 1))

    # Covariates, constant regressor, then sulci data
    X = np.hstack((scaled_cov, ones_column, sulci_rows))

    np.save(x_file, X)
    np.save(z_file, z)
    print('Data saved.')
    return X, z
def load_residualized_bmi_data(cache):
    """Build, or reload from cache, (X, Y) and return the sulci dataframe.

    Parameters
    ----------
    cache : bool
        Falsy: rebuild X and Y from the raw CSV inputs and save them as
        ``X.npy`` / ``Y.npy`` under SHARED_DIR.
        Truthy: load those two files instead of rebuilding.

    Returns
    -------
    X : numpy array
        Scaled covariates, a column of ones, then scaled BMI.
    Y : numpy array
        Scaled sulci features.
    sulci_df_qc : pandas DataFrame
        Content of ``sulci_df_qc.csv``, indexed by its first column.
    """
    # The sulci dataframe is part of the return value in BOTH branches, so
    # it is read unconditionally.  It used to be read only when rebuilding,
    # which made ``cache=True`` fail on the return statement with an
    # unbound local variable.
    sulci_df_qc = pd.read_csv(os.path.join(QC_PATH, 'sulci_df_qc.csv'),
                              sep=',',
                              index_col=0)

    if not cache:
        # BMI
        BMI_df = pd.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                             sep=',',
                             index_col=0)

        # Dataframe for picking out only clinical cofounds of non interest
        clinical_df = pd.read_csv(
            os.path.join(CLINIC_DATA_PATH,
                         'clinical_data_norm-ob_groups.csv'),
            index_col=0)

        # Add one cofound since sulci follows a power law
        clinical_df['tiv2'] = pow(clinical_df['tiv_gaser'], 2)

        clinical_cofounds = [
            'Gender de Feuil2',
            'ImagingCentreCity',
            'tiv_gaser',
            'tiv2',
            'mean_pds',
        ]
        clinical_df = clinical_df[clinical_cofounds]

        # Subjects present in both the clinical and the sulci tables
        subjects_index = np.intersect1d(clinical_df.index.values,
                                        sulci_df_qc.index.values)

        # Same subject selection applied to the three tables
        sulci_data = sulci_df_qc.loc[subjects_index]
        clinical_data = clinical_df.loc[subjects_index]
        BMI = BMI_df.loc[subjects_index]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Center & scale sulci_data
        sulci_data = skl.fit_transform(sulci_data)
        print("sulci_data loaded")

        # Constant regressor to mimick the fit intercept
        constant_regressor = np.ones((sulci_data.shape[0], 1))

        # Concatenate covariates, constant regressor and BMI
        X = np.hstack((cov, constant_regressor, BMI))
        Y = sulci_data

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)
        print("Data saved")
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print("Data read from cache")
    return X, Y, sulci_df_qc
def load_residualized_bmi_data(cache):
    """Build, or reload from cache, (X, Y) and return the depthMax table.

    Parameters
    ----------
    cache : bool
        Falsy: rebuild X and Y from the raw CSV inputs and save them as
        ``X.npy`` / ``Y.npy`` under SHARED_DIR.
        Truthy: load those two files instead of rebuilding.

    Returns
    -------
    X : numpy array
        Scaled covariates, a column of ones, then scaled BMI.
    Y : numpy array
        Scaled sulci depthMax features.
    sulci_depthMax_df : pandas DataFrame
        Columns of ``sulci_df_qc.csv`` whose name contains ``depthMax``.
    """
    # The depthMax dataframe is part of the return value in BOTH branches,
    # so it is built unconditionally.  It used to be built only when
    # rebuilding, which made ``cache=True`` fail on the return statement
    # with an unbound local variable.
    sulci_df_qc = pd.read_csv(os.path.join(QC_PATH, 'sulci_df_qc.csv'),
                              sep=',',
                              index_col=0)

    # Extract only sulci depthMax among sulci features
    sulci_feature_colnames = [name
                              for name in sulci_df_qc.columns.tolist()
                              if 'depthMax' in name]
    sulci_depthMax_df = sulci_df_qc[sulci_feature_colnames]

    if not cache:
        # BMI
        BMI_df = pd.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                             sep=',',
                             index_col=0)

        # Dataframe for picking out only clinical cofounds of non interest
        clinical_df = pd.read_csv(
            os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)
        clinical_cofounds = [
            'Gender de Feuil2',
            'ImagingCentreCity',
            'tiv_gaser',
            'mean_pds',
        ]
        clinical_df = clinical_df[clinical_cofounds]

        # Consider subjects for whom we have neuroimaging and genetic data
        subjects_id = np.genfromtxt(os.path.join(DATA_PATH,
                                                 'subjects_id.csv'),
                                    dtype=None,
                                    delimiter=',',
                                    skip_header=1)

        # ... intersected with the subjects that have sulci features
        subjects_index = np.intersect1d(subjects_id,
                                        sulci_depthMax_df.index.values)

        # Same subject selection applied to the three tables
        sulci_data = sulci_depthMax_df.loc[subjects_index]
        clinical_data = clinical_df.loc[subjects_index]
        BMI = BMI_df.loc[subjects_index]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)

        # Center & scale sulci_data
        sulci_data = skl.fit_transform(sulci_data)
        print("Sulci data loaded.")

        # Constant regressor to mimick the fit intercept
        constant_regressor = np.ones((sulci_data.shape[0], 1))

        # Concatenate covariates, constant regressor and BMI
        X = np.hstack((cov, constant_regressor, BMI))
        Y = sulci_data

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)
        print("Data saved.")
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print("Data read from cache.")
    return X, Y, sulci_depthMax_df
def load_fMRI_SNPs_bmi_data(cache):
    """Return (X, Y): covariates + constant + SNPs, and fMRI images.

    With a falsy ``cache`` both arrays are rebuilt from the raw inputs and
    written to ``X.npy`` / ``Y.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.
    """
    x_file = os.path.join(SHARED_DIR, 'X.npy')
    y_file = os.path.join(SHARED_DIR, 'Y.npy')

    # Fast path: reload what a previous run saved
    if cache:
        X = np.load(x_file)
        Y = np.load(y_file)
        print('Data read from cache.')
        return X, Y

    # Clinical confounds of non interest
    confound_names = [
        'Gender de Feuil2',
        'ImagingCentreCity',
        'tiv_gaser',
        'mean_pds',
    ]
    clinical_table = pd.io.parsers.read_csv(
        os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)
    clinical_table = clinical_table[confound_names]

    # SNPs
    snps_table = pd.io.parsers.read_csv(
        os.path.join(DATA_PATH, 'BMI_associated_SNPs_measures.csv'),
        index_col=0)

    # fMRI left motor task images
    Y = np.load(os.path.join(GCA_motor_left_PATH,
                             'GCA_motor_left_images.npy'))

    # Subjects who had an fMRI examination
    fmri_table = pd.io.parsers.read_csv(
        os.path.join(GCA_motor_left_PATH,
                     'subjects_id_left_motor_fMRI.csv'),
        index_col=0)

    # Subjects present in the SNPs, fMRI and clinical tables
    in_snps_and_fmri = np.intersect1d(snps_table.index.values,
                                      fmri_table.index.values)
    common_ids = np.intersect1d(in_snps_and_fmri,
                                clinical_table.index.values).tolist()

    # NOTE(review): only the clinical and SNPs tables are restricted to
    # common_ids; the image array is saved with all of its rows, in its own
    # order.  Presumably its rows follow the fMRI subject list -- verify
    # that X and Y rows actually correspond.
    clinical_rows = clinical_table.loc[common_ids]
    snps_rows = snps_table.loc[common_ids]

    # Dummy coding; the last column (constant regressor, per the original
    # comment) is left out of the scaling
    coded = utils.make_design_matrix(
        clinical_rows, regressors=confound_names).as_matrix()
    scaled_cov = StandardScaler().fit_transform(coded[:, :-1])

    # Column of ones to mimick the fit intercept
    ones_column = np.ones((snps_rows.shape[0], 1))

    # Covariates, constant regressor, then SNPs
    X = np.hstack((scaled_cov, ones_column, snps_rows))

    np.save(x_file, X)
    np.save(y_file, Y)
    print('Data saved.')
    return X, Y
def load_sulci_SNPs_data(cache):
    """Build, or reload from cache, (X, Y) and return the SNPs dataframe.

    Parameters
    ----------
    cache : bool
        Falsy: rebuild X and Y from the raw CSV inputs and save them as
        ``X.npy`` / ``Y.npy`` under SHARED_DIR.
        Truthy: load those two files instead of rebuilding.

    Returns
    -------
    X : numpy array
        Scaled covariates, a column of ones, then scaled BMI.
    Y : SNPs of the selected subjects (a DataFrame when rebuilt, the saved
        numpy array when read from cache).
    SNPs_df : pandas DataFrame
        Content of ``BMI_associated_SNPs_measures.csv``, indexed by its
        first column.
    """
    # The SNPs dataframe is part of the return value in BOTH branches, so
    # it is read unconditionally.  It used to be read only when rebuilding,
    # which made ``cache=True`` fail on the return statement with an
    # unbound local variable.
    SNPs_df = pd.read_csv(
        os.path.join(DATA_PATH, 'BMI_associated_SNPs_measures.csv'),
        index_col=0)

    if not cache:
        # BMI
        BMI_df = pd.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                             sep=',',
                             index_col=0)

        # Dataframe for picking out only clinical cofounds of non interest
        clinical_df = pd.read_csv(
            os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)

        # Cofounds
        clinical_cofounds = [
            'Gender de Feuil2',
            'ImagingCentreCity',
            'tiv_gaser',
            'mean_pds',
        ]
        clinical_df = clinical_df[clinical_cofounds]

        # Subjects present in both the SNPs and the BMI tables
        subjects_id = np.intersect1d(SNPs_df.index.values,
                                     BMI_df.index.values)

        # Same subject selection applied to the three tables
        SNPs = SNPs_df.loc[subjects_id]
        BMI = BMI_df.loc[subjects_id]
        clinical_data = clinical_df.loc[subjects_id]

        # Conversion dummy coding
        covar = utils.make_design_matrix(
            clinical_data, regressors=clinical_cofounds).as_matrix()

        # Center and scale covariates, but not constant regressor's column
        cov = covar[:, 0:-1]
        skl = StandardScaler()
        cov = skl.fit_transform(cov)

        # Center & scale BMI
        BMI = skl.fit_transform(BMI)
        print('BMI loaded.')

        # Constant regressor to mimick the fit intercept
        constant_regressor = np.ones((BMI.shape[0], 1))

        # Concatenate covariates, constant regressor and BMI
        X = np.hstack((cov, constant_regressor, BMI))
        Y = SNPs

        np.save(os.path.join(SHARED_DIR, 'X.npy'), X)
        np.save(os.path.join(SHARED_DIR, 'Y.npy'), Y)
        print('Data saved.')
    else:
        X = np.load(os.path.join(SHARED_DIR, 'X.npy'))
        Y = np.load(os.path.join(SHARED_DIR, 'Y.npy'))
        print('Data read from cache.')
    return X, Y, SNPs_df
##################################################### # Construct EPAC workflow pipeline = epac.Pipe(MULMStats(), ClusterStats()) # 1st model: most of the covariables MODEL = [ 'group_sub_ctl', 'Gender', 'pds', 'Age', 'ImagingCentreCity', 'Scanner_Type', 'vol_GM', 'vol_WM', 'vol_CSF', 'TIV', 'GM_on_TIV', 'WM_on_TIV', 'CSF_on_TIV', 'VSF', 'tristesse', 'irritabilite', 'anhedonie', 'total_symptoms_dep' ] MODEL_OUT = os.path.join(OUT_DIR, "all-covariates") if not os.path.exists(MODEL_OUT): os.makedirs(MODEL_OUT) design_mat = utils.make_design_matrix(df, regressors=MODEL).as_matrix() Y = masked_images contrast = numpy.zeros(design_mat.shape[1]) contrast[0] = 1 contrast[1] = -1 isnan = numpy.isnan(design_mat) if isnan.any(): bad_subject_ind = numpy.where(isnan)[0] print "Removing subject", bad_subject_ind design_mat = numpy.delete(design_mat, bad_subject_ind, axis=0) Y = numpy.delete(Y, bad_subject_ind, axis=0) pipeline_res = pipeline.run(design_matrix=design_mat, Y=Y, mask=mask, contrast=contrast,
def load_residualized_bmi_data(cache):
    """Return (X, z): covariates + constant + Freesurfer volumes, and BMI.

    With a falsy ``cache`` both arrays are rebuilt from the raw inputs and
    written to ``X.npy`` / ``z.npy`` under SHARED_DIR; with a truthy
    ``cache`` they are read back from those two files.
    """
    x_file = os.path.join(SHARED_DIR, 'X.npy')
    z_file = os.path.join(SHARED_DIR, 'z.npy')

    # Fast path: reload what a previous run saved
    if cache:
        X = np.load(x_file)
        z = np.load(z_file)
        print("Data read from cache")
        return X, z

    # BMI
    bmi_table = pd.io.parsers.read_csv(os.path.join(DATA_PATH, 'BMI.csv'),
                                       sep=',',
                                       index_col=0)

    # Freesurfer spreadsheet from the IMAGEN database.  The second CSV
    # column is read separately; the integer subject id is whatever follows
    # its first 25 characters.
    freesurfer_csv = os.path.join(FREESURFER_PATH,
                                  'IMAGEN_Freesurfer_data_29juil2014.csv')
    raw_labels = np.genfromtxt(freesurfer_csv,
                               dtype=None,
                               delimiter=',',
                               skip_header=1,
                               usecols=1)
    subject_numbers = [int(label[25:]) for label in raw_labels]

    volume_columns = [
        'lhCortexVol',
        'rhCortexVol',
        'CortexVol',
        'SubCortGrayVol',
        'TotalGrayVol',
        'SupraTentorialVol',
        'lhCorticalWhiteMatterVol',
        'rhCorticalWhiteMatterVol',
        'CorticalWhiteMatterVol',
    ]
    freesurfer_table = pd.io.parsers.read_csv(freesurfer_csv,
                                              sep=',',
                                              usecols=volume_columns)
    # Re-index by the parsed subject ids
    freesurfer_table = freesurfer_table.set_index(pd.Index(subject_numbers))

    # Clinical confounds of non interest
    confound_names = [
        'Gender de Feuil2',
        'ImagingCentreCity',
        'tiv_gaser',
        'mean_pds',
    ]
    clinical_table = pd.io.parsers.read_csv(
        os.path.join(CLINIC_DATA_PATH, 'population.csv'), index_col=0)
    clinical_table = clinical_table[confound_names]

    # Subjects for which we have neuroimaging and genetic data
    listed_ids = np.genfromtxt(os.path.join(DATA_PATH, 'subjects_id.csv'),
                               dtype=None,
                               delimiter=',',
                               skip_header=1)

    # Select those subjects, then drop every row holding a NaN
    volumes = freesurfer_table.loc[listed_ids].dropna()

    # Surviving subjects drive the selection in the other two tables
    kept_ids = volumes.index
    clinical_rows = clinical_table.loc[kept_ids]
    bmi_rows = bmi_table.loc[kept_ids]

    # Dummy coding; the last column (constant regressor, per the original
    # comment) is left out of the scaling
    coded = utils.make_design_matrix(
        clinical_rows, regressors=confound_names).as_matrix()
    scaler = StandardScaler()
    scaled_cov = scaler.fit_transform(coded[:, :-1])

    # Center & scale the Freesurfer volumes, then BMI
    volumes = scaler.fit_transform(volumes)
    z = scaler.fit_transform(bmi_rows)

    # Column of ones to mimick the fit intercept
    ones_column = np.ones((volumes.shape[0], 1))

    # Covariates, constant regressor, then Freesurfer volumes
    X = np.hstack((scaled_cov, ones_column, volumes))

    np.save(x_file, X)
    np.save(z_file, z)
    print("Data saved")
    return X, z