def load_response_vars(datafile, maskfile=None, vol=True):
    """Load response variables from a file of any supported data type.

    :param datafile: path of the file holding the response variables
    :param maskfile: mask forwarded to ``fileio.create_mask`` (nifti only)
    :param vol: forwarded to ``fileio.load_nifti`` (nifti only)
    :returns: tuple ``(Y, volmask)``; ``volmask`` is None for non-nifti input
    """
    if fileio.file_type(datafile) != 'nifti':
        # Non-nifti input: nothing to mask.
        Y = fileio.load(datafile)
        if fileio.file_type(datafile) == 'cifti':
            # cifti data are transposed after loading
            Y = Y.T
        return Y, None

    # nifti input: build the mask from the loaded data and vectorize with it
    dat = fileio.load_nifti(datafile, vol=vol)
    volmask = fileio.create_mask(dat, mask=maskfile)
    return fileio.vol2vec(dat, volmask).T, volmask
def estimate(self, X, y, **kwargs):
    """Fit the wrapped ``self.hbr`` model and return ``self``.

    :param X: forwarded unchanged to ``self.hbr.estimate``
    :param y: forwarded unchanged to ``self.hbr.estimate``
    :param trbefile: (keyword) path of the training batch-effects file. When
        it is not given, a single all-zero column with one row per row of
        ``X`` is used instead.
    :returns: self
    """
    trbefile = kwargs.pop('trbefile', None)
    if trbefile is None:
        print('Could not find batch-effects file! Initilizing all as zeros ...')
        batch_effects_train = np.zeros([X.shape[0], 1])
    else:
        batch_effects_train = fileio.load(trbefile)

    self.hbr.estimate(X, y, batch_effects_train)
    return self
def predict(self, Xs, X=None, Y=None, **kwargs):
    """Predict with the wrapped ``self.hbr`` model.

    :param Xs: forwarded to ``self.hbr.predict``
    :param X: unused by this method
    :param Y: unused by this method
    :param tsbefile: (keyword) path of the test batch-effects file. When it
        is not given, a single all-zero column with one row per row of ``Xs``
        is used instead.
    :returns: tuple ``(yhat, s2)``, both squeezed
    """
    tsbefile = kwargs.pop('tsbefile', None)
    if tsbefile is None:
        print('Could not find batch-effects file! Initilizing all as zeros ...')
        batch_effects_test = np.zeros([Xs.shape[0], 1])
    else:
        batch_effects_test = fileio.load(tsbefile)

    prediction = self.hbr.predict(Xs, batch_effects_test,
                                  pred=self.configs['pred_type'])
    yhat, s2 = prediction
    return yhat.squeeze(), s2.squeeze()
def rerun_nm(processing_dir, log_path, memory, duration, binary=False):
    """Resubmit every batch listed in the ``failed_batches`` file.

    The list is read from ``processing_dir + 'failed_batches.pkl'`` when
    ``binary`` is true and from ``processing_dir + 'failed_batches.txt'``
    otherwise; the first column of each row is taken as the job path.

    * Input:
        * processing_dir -> Full path to the processing directory
        * log_path       -> Forwarded to ``qsub_nm`` as ``log_path``
        * memory         -> Memory requirements written as string,
                            for example 4gb or 500mb
        * duration       -> The approximate duration of the job, a string
                            with HH:MM:SS, for example 01:01:01
        * binary         -> Selects the '.pkl' (True) or '.txt' (False) list

    written by (primarily) T Wolfers, (adapted) SM Kia
    """
    if binary:
        failed_batches = fileio.load(processing_dir + 'failed_batches' + '.pkl')
        num_failed = failed_batches.shape[0]
        job_paths = (failed_batches[row, 0] for row in range(num_failed))
    else:
        failed_batches = fileio.load_pd(processing_dir + 'failed_batches' + '.txt')
        num_failed = failed_batches.shape[0]
        job_paths = (failed_batches.iloc[row, 0] for row in range(num_failed))

    # Both formats share the same resubmission step.
    for jobpath in job_paths:
        print(jobpath)
        qsub_nm(job_path=jobpath,
                log_path=log_path,
                memory=memory,
                duration=duration)
def predict(self, Xs, X=None, Y=None, **kwargs):
    """Predict with the wrapped ``self.hbr`` model (non-transferred only).

    :param Xs: forwarded to ``self.hbr.predict``
    :param X: unused by this method
    :param Y: unused by this method
    :param tsbefile: (keyword) path of the test batch-effects file. When it
        is not given, a single all-zero column with one row per row of ``Xs``
        is used instead.
    :returns: tuple ``(yhat, s2)``, both squeezed
    :raises ValueError: if ``self.configs['transferred']`` is set
    """
    # Fail fast: a transferred model cannot be used here, so do not load the
    # batch-effects file (or print the "could not find" notice) first.
    if self.configs['transferred']:
        raise ValueError(
            "This is a transferred model. Please use predict_on_new_sites function."
        )

    tsbefile = kwargs.pop('tsbefile', None)
    if tsbefile is not None:
        batch_effects_test = fileio.load(tsbefile)
    else:
        print('Could not find batch-effects file! Initilizing all as zeros ...')
        batch_effects_test = np.zeros([Xs.shape[0], 1])

    pred_type = self.configs['pred_type']
    yhat, s2 = self.hbr.predict(Xs, batch_effects_test, pred=pred_type)
    return yhat.squeeze(), s2.squeeze()
def _save_failed_batch_placeholders(batch, numsubjects, batch_size,
                                    outputsuffix, file_extentions):
    """Write placeholder result files for a batch that produced no yhat."""
    # pRho placeholders are ones; all other per-feature metrics are zeros.
    per_feature = (('pRho', np.ones), ('Rho', np.zeros), ('RMSE', np.zeros),
                   ('SMSE', np.zeros), ('EXPV', np.zeros), ('MSLL', np.zeros))
    for name, filler in per_feature:
        fileio.save(pd.Series(filler(batch_size)),
                    batch + name + outputsuffix + file_extentions)

    for name in ('yhat', 'ys2', 'Z'):
        fileio.save(pd.DataFrame(np.zeros([numsubjects, batch_size])),
                    batch + name + outputsuffix + file_extentions)

    # Create the directory that the guard actually tests for (the original
    # created 'Models' in the current working directory instead).
    if not os.path.isdir(batch + 'Models'):
        os.mkdir(batch + 'Models')


def _combine_batch_outputs(processing_dir, name, axis, outputsuffix,
                           file_extentions):
    """Concatenate one output type across all batches and save the result."""
    filenames = glob.glob(processing_dir + 'batch_*/' + name +
                          outputsuffix + '*')
    if not filenames:
        return
    filenames = fileio.sort_nicely(filenames)
    frames = [pd.DataFrame(fileio.load(filename)) for filename in filenames]
    combined = pd.concat(frames, ignore_index=True, axis=axis)
    fileio.save(combined,
                processing_dir + name + outputsuffix + file_extentions)


def collect_nm(processing_dir, job_name, func='estimate', collect=False,
               binary=False, batch_size=None, outputsuffix='_estimate'):
    """Check all batches for failures and optionally collect their outputs.

    ** Input:
        * processing_dir -> Full path to the processing directory
                            (concatenated directly with 'batch_*/')
        * job_name       -> Prefix of the '*.sh' job scripts in each batch
                            directory
        * func           -> Name of the function that was run. For anything
                            but 'fit' the yhat files are inspected; for
                            anything but 'predict'/'transfer' the models are
                            gathered when collecting
        * collect        -> If True data is checked for failed batches and
                            collected; if False data is just checked
        * binary         -> If true files are '.pkl', otherwise '.txt'
        * batch_size     -> Number of features per batch; overwritten from an
                            example yhat file unless func is 'fit'
        * outputsuffix   -> Suffix of the output file names

    ** Output:
        * Text/pickle files containing the results combined across batches
        * A 'failed_batches' file listing the scripts of failed batches
        * Returns 1 if no batch failed, 0 otherwise

    written by (primarily) T Wolfers, (adapted) SM Kia
    """
    file_extentions = '.pkl' if binary else '.txt'

    # detect number of subjects, batches, hyperparameters and CV
    batches = glob.glob(processing_dir + 'batch_*/')

    count = 0
    batch_fail = []

    if func != 'fit':
        # Use the first batch that has a yhat file to infer the dimensions.
        # NOTE(review): if no batch has one, the [0] below raises IndexError.
        file_example = []
        for batch in batches:
            file_example = glob.glob(batch + 'yhat' + outputsuffix +
                                     file_extentions)
            if file_example:
                break
        if binary:
            file_example = pd.read_pickle(file_example[0])
        else:
            file_example = fileio.load(file_example[0])
        numsubjects = file_example.shape[0]
        batch_size = file_example.shape[1]

        # artificially creates files for batches that were not executed
        batch_dirs = glob.glob(processing_dir + 'batch_*/')
        batch_dirs = fileio.sort_nicely(batch_dirs)
        for batch in batch_dirs:
            filepath = glob.glob(batch + 'yhat' + outputsuffix + '*')
            if filepath == []:
                count = count + 1
                batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                print(batch1)
                batch_fail.append(batch1)
                if collect is True:
                    _save_failed_batch_placeholders(batch, numsubjects,
                                                    batch_size, outputsuffix,
                                                    file_extentions)
            else:
                # if more than 10% of yhat is nan then consider the batch as
                # a failed batch
                yhat = fileio.load(filepath[0])
                valid_fraction = (np.count_nonzero(~np.isnan(yhat)) /
                                  np.prod(yhat.shape))
                if valid_fraction < 0.9:
                    count = count + 1
                    batch1 = glob.glob(batch + '/' + job_name + '*.sh')
                    print('More than 10% nans in ' + batch1[0])
                    batch_fail.append(batch1)

    # combines all output files across batches
    if collect is True:
        # Per-feature metrics are stacked row-wise (axis 0); subject-by-
        # feature matrices are stacked column-wise (axis 1).
        outputs = (('pRho', 0), ('Rho', 0), ('Z', 1), ('yhat', 1),
                   ('ys2', 1), ('RMSE', 0), ('SMSE', 0), ('EXPV', 0),
                   ('MSLL', 0))
        for name, axis in outputs:
            _combine_batch_outputs(processing_dir, name, axis, outputsuffix,
                                   file_extentions)

        if func != 'predict' and func != 'transfer':
            if not os.path.isdir(processing_dir + 'Models') and \
               os.path.exists(os.path.join(batches[0], 'Models')):
                os.mkdir(processing_dir + 'Models')

            meta_filenames = glob.glob(processing_dir + 'batch_*/Models/' +
                                       'meta_data.md')
            mY = []
            sY = []
            mX = []
            sX = []
            if meta_filenames:
                meta_filenames = fileio.sort_nicely(meta_filenames)
                with open(meta_filenames[0], 'rb') as file:
                    meta_data = pickle.load(file)
                if meta_data['standardize']:
                    # NOTE(review): only the first batch's meta data is ever
                    # loaded, so this stacks one copy of it per batch rather
                    # than each batch's own statistics. Left unchanged because
                    # consumers of this file are outside this block - confirm
                    # the intended layout before changing it.
                    for meta_filename in meta_filenames:
                        mY.append(meta_data['mean_resp'])
                        sY.append(meta_data['std_resp'])
                        mX.append(meta_data['mean_cov'])
                        sX.append(meta_data['std_cov'])
                    meta_data['mean_resp'] = np.stack(mY)
                    meta_data['std_resp'] = np.stack(sY)
                    meta_data['mean_cov'] = np.stack(mX)
                    meta_data['std_cov'] = np.stack(sX)

                with open(os.path.join(processing_dir, 'Models',
                                       'meta_data.md'), 'wb') as file:
                    pickle.dump(meta_data, file)

            batch_dirs = glob.glob(processing_dir + 'batch_*/')
            if batch_dirs:
                batch_dirs = fileio.sort_nicely(batch_dirs)
                for b, batch_dir in enumerate(batch_dirs):
                    src_files = glob.glob(batch_dir + 'Models/*.pkl')
                    if src_files:
                        src_files = fileio.sort_nicely(src_files)
                        for f, full_file_name in enumerate(src_files):
                            if os.path.isfile(full_file_name):
                                # Renumber the model with its global feature
                                # index across all batches.
                                file_name = os.path.basename(full_file_name)
                                n = file_name.split('_')
                                n[-1] = str(b * batch_size + f) + '.pkl'
                                n = '_'.join(n)
                                shutil.copy(full_file_name,
                                            processing_dir + 'Models/' + n)
                    elif func == 'fit':
                        # A 'fit' batch without any saved model has failed.
                        count = count + 1
                        batch1 = glob.glob(batch_dir + '/' + job_name +
                                           '*.sh')
                        print('Failed batch: ' + batch1[0])
                        batch_fail.append(batch1)

    # list batches that were not executed
    print('Number of batches that failed:' + str(count))
    batch_fail_df = pd.DataFrame(batch_fail)
    if file_extentions == '.txt':
        fileio.save_pd(batch_fail_df, processing_dir + 'failed_batches' +
                       file_extentions)
    else:
        fileio.save(batch_fail_df, processing_dir + 'failed_batches' +
                    file_extentions)

    if not batch_fail:
        return 1
    else:
        return 0
def extend(covfile, respfile, maskfile=None, **kwargs):
    """Extend pre-estimated HBR models and save the extended models.

    Each model ``NM_0_<index>.pkl`` is loaded from ``model_path``, its
    ``extend`` method is called with the new data, and the result is saved
    under the same file name in ``output_path``.

    :param covfile: covariates file, loaded with ``fileio.load``
    :param respfile: response variables file
    :param maskfile: mask used to apply to the data (nifti only)
    :param alg: (keyword, mandatory) algorithm; must be 'hbr'
    :param model_path: (keyword, mandatory) directory holding the models
    :param output_path: (keyword, mandatory) directory for extended models;
        created if missing
    :param trbefile: (keyword, mandatory) batch effects file for covfile
    :param dummycovfile: (keyword, mandatory) dummy covariates file
    :param dummybefile: (keyword, mandatory) dummy batch effects file
    :param informative_prior: (keyword) 'True' or 'False' (default 'False'),
        forwarded to the model's ``extend``
    :param generation_factor: (keyword) forwarded as ``samples``
        (default '10')
    :param batch_size: (keyword) batch size (for use with normative_parallel)
    :param job_id: (keyword) 1-based batch id; required with batch_size

    :returns: None
    """
    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model extention is only possible for HBR models.')
        return

    mandatory = ('model_path', 'output_path', 'trbefile', 'dummycovfile',
                 'dummybefile')
    if any(key not in kwargs for key in mandatory):
        print('InputError: Some mandatory arguments are missing.')
        return

    model_path = kwargs.pop('model_path')
    output_path = kwargs.pop('output_path')
    trbefile = kwargs.pop('trbefile')
    dummycovfile = kwargs.pop('dummycovfile')
    dummybefile = kwargs.pop('dummybefile')

    # The original popped 'job_id' here, which consumed the job id and made
    # the flag impossible to set; read the intended key instead.
    informative_prior = str(kwargs.pop('informative_prior', 'False')) == 'True'
    generation_factor = int(kwargs.pop('generation_factor', '10'))
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1  # job ids are 1-based

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    batch_effects_train = fileio.load(trbefile)
    X_dummy = fileio.load(dummycovfile)
    batch_effects_dummy = fileio.load(dummybefile)

    # promote 1-D inputs to column vectors
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    if len(X_dummy.shape) == 1:
        X_dummy = X_dummy[:, np.newaxis]
    feature_num = Y.shape[1]

    # estimate the models for all subjects
    for i in range(feature_num):
        nm = norm_init(X)
        if batch_size is not None:  # when using normative_parallel
            model_index = job_id * batch_size + i
            print("Extending model ", model_index)
        else:
            model_index = i
            print("Extending model ", i + 1, "of", feature_num)
        model_file = 'NM_0_' + str(model_index) + '.pkl'

        nm = nm.load(os.path.join(model_path, model_file))
        nm = nm.extend(X, Y[:, i:i + 1], batch_effects_train,
                       X_dummy, batch_effects_dummy,
                       samples=generation_factor,
                       informative_prior=informative_prior)
        nm.save(os.path.join(output_path, model_file))
def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
             **kwargs):
    '''
    Transfer learning on the basis of a pre-estimated normative model by using
    the posterior distribution over the parameters as an informed prior for
    new data. Currently only supported for HBR.

    Basic usage::

        transfer(covfile, respfile [extra_arguments])

    where the variables are defined below.

    :param covfile: covariates passed to ``estimate_on_new_sites``
    :param respfile: response variables passed to ``estimate_on_new_sites``
    :param maskfile: mask used to apply to the data (nifti only)
    :param testcov: Test covariates (optional)
    :param testresp: Test responses (optional, only used with testcov)
    :param alg: (keyword, mandatory) algorithm; must be 'hbr'
    :param model_path: (keyword, mandatory) Directory containing the
        normative model and metadata
    :param output_path: (keyword, mandatory) Directory for the transferred
        models; created if missing
    :param trbefile: (keyword, mandatory) Training batch effects file
    :param tsbefile: (keyword) Test batch effects file; zeros with two
        columns are used when it is not given
    :param outputsuffix: (keyword) suffix of the output files
        (default '_transfer')
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: 1-based batch id; required with batch_size

    All outputs are written to disk in the same format as the input. These
    are:

    :outputs: * Yhat - predictive mean
              * S2 - predictive variance
              * Z - Z scores

    :returns: None if no test covariates are given (the transferred models
        are still saved), ``(Yhat, S2)`` if only test covariates are given,
        ``(Yhat, S2, Z)`` if test responses are given as well
    '''
    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model transferring is only possible for HBR models.')
        return

    mandatory = ('model_path', 'output_path', 'trbefile')
    if any(key not in kwargs for key in mandatory):
        print('InputError: Some mandatory arguments are missing.')
        return

    model_path = kwargs.pop('model_path')
    output_path = kwargs.pop('output_path')
    trbefile = kwargs.pop('trbefile')
    batch_effects_train = fileio.load(trbefile)

    outputsuffix = kwargs.pop('outputsuffix', '_transfer')
    tsbefile = kwargs.pop('tsbefile', None)

    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1  # job ids are 1-based

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    feature_num = Y.shape[1]
    mY = np.mean(Y, axis=0)
    sY = np.std(Y, axis=0)

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        ts_sample_num = Xte.shape[0]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            Yte = np.zeros([ts_sample_num, feature_num])
        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            batch_effects_test = np.zeros([Xte.shape[0], 2])

        # Prediction buffers only exist when there is test data to predict.
        Yhat = np.zeros([ts_sample_num, feature_num])
        S2 = np.zeros([ts_sample_num, feature_num])

    # estimate the models for all subjects
    for i in range(feature_num):
        nm = norm_init(X)
        if batch_size is not None:  # when using normative_parallel
            model_index = job_id * batch_size + i
            print("Transferting model ", model_index)
        else:
            model_index = i
            print("Transferting model ", i + 1, "of", feature_num)
        model_file = 'NM_0_' + str(model_index) + '.pkl'

        nm = nm.load(os.path.join(model_path, model_file))
        nm = nm.estimate_on_new_sites(X, Y[:, i], batch_effects_train)
        nm.save(os.path.join(output_path, model_file))

        if testcov is not None:
            yhat, s2 = nm.predict_on_new_sites(Xte, batch_effects_test)
            Yhat[:, i] = yhat.squeeze()
            S2[:, i] = s2.squeeze()

    if testcov is None:
        # Nothing to predict on: the models were transferred and saved. The
        # original fell through to undefined prediction buffers here.
        return

    if testresp is None:
        save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
        return (Yhat, S2)

    Z = (Yte - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    results = evaluate(Yte, Yhat, S2=S2, mY=mY, sY=sY)

    save_results(respfile, Yhat, S2, maskvol, Z=Z, results=results,
                 outputsuffix=outputsuffix)

    return (Yhat, S2, Z)
def predict(covfile, respfile=None, maskfile=None, **kwargs):
    '''
    Make predictions on the basis of a pre-estimated normative model.
    If only the covariates are specified then only predicted mean and variance
    will be returned. If the test responses are also specified then quantities
    that depend on those will also be returned (Z scores and error metrics)

    Basic usage::

        predict(covfile, [extra_arguments])

    where the variables are defined below.

    :param covfile: test covariates used to predict the response variable
    :param respfile: test response variables for the normative model
    :param maskfile: mask used to apply to the data (nifti only)
    :param model_path: Directory containing the normative model and metadata
        (default 'Models')
    :param output_path: Directory to store the results; created if missing
    :param outputsuffix: Text string to add to the output filenames
        (default '_predict')
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id

    Any remaining keyword arguments are forwarded to each model's ``predict``.

    All outputs are written to disk in the same format as the input. These
    are:

    :outputs: * Yhat - predictive mean
              * S2 - predictive variance
              * Z - Z scores

    :returns: None if the model directory does not exist, ``(Yhat, S2)``
        without responses, ``(Yhat, S2, Z)`` with responses
    '''
    model_path = kwargs.pop('model_path', 'Models')
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    output_path = kwargs.pop('output_path', '')
    outputsuffix = kwargs.pop('outputsuffix', '_predict')

    if respfile is not None and not os.path.exists(respfile):
        print("Response file does not exist. Only returning predictions")
        respfile = None

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return

    meta_path = os.path.join(model_path, 'meta_data.md')
    if os.path.exists(meta_path):
        # NOTE(review): pickle.load must only be used on trusted model
        # directories - unpickling can execute arbitrary code.
        with open(meta_path, 'rb') as file:
            meta_data = pickle.load(file)
        standardize = meta_data['standardize']
        mY = meta_data['mean_resp']
        sY = meta_data['std_resp']
        mX = meta_data['mean_cov']
        sX = meta_data['std_cov']
    else:
        standardize = False

    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    if (output_path != '') and (not os.path.isdir(output_path)):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    sample_num = X.shape[0]
    # one model file per feature
    feature_num = len(glob.glob(os.path.join(model_path, 'NM_*.pkl')))

    Yhat = np.zeros([sample_num, feature_num])
    S2 = np.zeros([sample_num, feature_num])

    if standardize:
        # apply the covariate scaling stored with the model
        Xz = (X - mX[0]) / sX[0]
    else:
        Xz = X

    # estimate the models for all subjects
    for i in range(feature_num):
        print("Prediction by model ", i + 1, "of", feature_num)
        nm = norm_init(Xz)
        nm = nm.load(os.path.join(model_path,
                                  'NM_' + str(0) + '_' + str(i) + '.pkl'))
        yhat, s2 = nm.predict(Xz, **kwargs)
        if standardize:
            # map predictions back to the original response scale
            Yhat[:, i] = yhat.squeeze() * sY[0][i] + mY[0][i]
            S2[:, i] = s2.squeeze() * sY[0][i]**2
        else:
            Yhat[:, i] = yhat.squeeze()
            S2[:, i] = s2.squeeze()

    if respfile is None:
        # Honour output_path here too; the original only used it when
        # responses were available.
        save_results(None, Yhat, S2, None, outputsuffix=outputsuffix,
                     save_path=output_path)
        return (Yhat, S2)

    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]

    # warp the targets?
    # (uses the warp of the last model loaded in the loop above)
    if 'blr' in dir(nm):
        if nm.blr.warp is not None:
            warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params() + 1]
            Y = nm.blr.warp.f(Y, warp_param)

    Z = (Y - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    results = evaluate(Y, Yhat, S2=S2,
                       metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

    print("Evaluations Writing outputs ...")
    save_results(respfile, Yhat, S2, maskvol, Z=Z,
                 outputsuffix=outputsuffix, results=results,
                 save_path=output_path)

    return (Yhat, S2, Z)
def fit(covfile, respfile, **kwargs):
    """Fit one normative model per valid response variable.

    :param covfile: covariates file, loaded with ``fileio.load``
    :param respfile: response variables file
    :param maskfile: (keyword) mask used to apply to the data (nifti only)
    :param alg: (keyword) algorithm for the normative model (default 'gpr')
    :param savemodel: (keyword) bool or 'True'/'False'; save each model and
        the meta data under 'Models/' (default 'True')
    :param standardize: (keyword) bool or 'True'/'False'; z-score the
        responses and covariates before fitting (default True)

    Any remaining keyword arguments are forwarded to ``norm_init`` and to the
    model's ``estimate``.

    :returns: the model fitted for the last valid response variable
    """
    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    alg = kwargs.pop('alg', 'gpr')

    # convert from strings if necessary: a 'False' string must not be
    # treated as truthy, and a bool True must not be compared to 'True'
    savemodel = kwargs.pop('savemodel', 'True')
    if isinstance(savemodel, str):
        savemodel = savemodel == 'True'
    standardize = kwargs.pop('standardize', True)
    if isinstance(standardize, str):
        standardize = standardize == 'True'

    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    mean_resp = []
    std_resp = []
    mean_cov = []
    std_cov = []

    # standardize responses and covariates, ignoring invalid entries
    mY = np.mean(Y[:, nz], axis=0)
    sY = np.std(Y[:, nz], axis=0)
    # recorded exactly once (the original appended a second copy when
    # standardizing)
    mean_resp.append(mY)
    std_resp.append(sY)
    if standardize:
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X, axis=0)
        sX = np.std(X, axis=0)
        Xz = (X - mX) / sX
        mean_cov.append(mX)
        std_cov.append(sX)
    else:
        Yz = Y
        Xz = X

    # estimate the models for all subjects
    for i in range(0, len(nz)):
        print("Estimating model ", i + 1, "of", len(nz))
        nm = norm_init(Xz, Yz[:, nz[i]], alg=alg, **kwargs)
        nm = nm.estimate(Xz, Yz[:, nz[i]], **kwargs)

        if savemodel:
            nm.save('Models/NM_' + str(0) + '_' + str(nz[i]) + '.pkl')

    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels': nz,
                         'mean_resp': mean_resp,
                         'std_resp': std_resp,
                         'mean_cov': mean_cov,
                         'std_cov': std_cov,
                         'regressor': alg,
                         'standardize': standardize}, file)

    return nm
def estimate(covfile, respfile, **kwargs):
    """ Estimate a normative model

    This will estimate a model in one of two settings according to the
    particular parameters specified (see below)

    * under k-fold cross-validation.
      requires respfile, covfile and cvfolds>=2
    * estimating a training dataset then applying to a second test dataset.
      requires respfile, covfile, testcov and testresp.
    * estimating on a training dataset, output of forward maps mean and se.
      requires respfile, covfile and testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(covfile, respfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds (int or numeric string)
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param configparam: Parameters controlling the estimation algorithm
    :param standardize: Standardize responses and covariates (bool or
        'True'/'False', default 'True')
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param savemodel: Save the models and meta data under 'Models/'
        (bool or 'True'/'False', default 'False')
    :param outputsuffix: Text string to add to the output filenames

    All outputs are written to disk in the same format as the input. These
    are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * nm - normative model
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    The outputsuffix may be useful to estimate multiple normative models in
    the same directory (e.g. for custom cross-validation schemes)

    :returns: None when saveoutput is true; otherwise
        ``(Yhat, S2, nm, Z, results)`` when test responses are available
        (cross-validation or testresp) and ``(Yhat, S2, nm)`` if not
    :raises ValueError: if neither cvfolds nor testcov is specified
    """
    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    cvfolds = kwargs.pop('cvfolds', None)
    testcov = kwargs.pop('testcov', None)
    testresp = kwargs.pop('testresp', None)
    alg = kwargs.pop('alg', 'gpr')
    outputsuffix = kwargs.pop('outputsuffix', '_estimate')
    standardize = kwargs.pop('standardize', 'True')
    # 'warp' is only read, not popped: it stays in kwargs for the model
    warp = kwargs.get('warp', None)

    # convert from strings if necessary
    if isinstance(standardize, str):
        standardize = standardize == 'True'
    saveoutput = kwargs.pop('saveoutput', 'True')
    if isinstance(saveoutput, str):
        saveoutput = saveoutput == 'True'
    savemodel = kwargs.pop('savemodel', 'False')
    if isinstance(savemodel, str):
        savemodel = savemodel == 'True'
    if cvfolds is not None:
        cvfolds = int(cvfolds)

    # fail before any I/O if there is nothing to evaluate on
    if cvfolds is None and testcov is None:
        raise ValueError('Either cvfolds or testcov must be specified.')

    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if (testcov is not None) and (cvfolds is None):
        # we have a separate test dataset
        run_cv = False
        cvfolds = 1
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            # no test responses: use zeros as placeholders
            sub_te = Xte.shape[0]
            Yte = np.zeros([sub_te, Nmod])

        # treat as a single train-test split: test rows are appended after
        # the training rows
        testids = range(X.shape[0], X.shape[0] + Xte.shape[0])
        splits = CustomCV((range(0, X.shape[0]),), (testids,))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)
    else:
        run_cv = True
        # we are running under cross-validation
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))

    mean_resp = []
    std_resp = []
    mean_cov = []
    std_cov = []

    if warp is not None:
        Ywarp = np.zeros_like(Yhat)
        mean_resp_warp = [np.zeros(Y.shape[1])
                          for s in range(splits.n_splits)]
        std_resp_warp = [np.zeros(Y.shape[1])
                         for s in range(splits.n_splits)]

    for fold, (tr, te) in enumerate(splits.split(X)):
        # standardize responses and covariates, ignoring invalid entries
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        mean_resp.append(mY)
        std_resp.append(sY)
        if standardize:
            Yz = np.zeros_like(Y)
            Yz[:, nz] = (Y[:, nz] - mY) / sY
            mX = np.mean(X[tr, :], axis=0)
            sX = np.std(X[tr, :], axis=0)
            Xz = (X - mX) / sX
            mean_cov.append(mX)
            std_cov.append(sX)
        else:
            Yz = Y
            Xz = X

        # estimate the models for all subjects
        for i in range(0, len(nz)):
            print("Estimating model ", i + 1, "of", len(nz))
            nm = norm_init(Xz[tr, :], Yz[tr, nz[i]], alg=alg, **kwargs)
            try:
                nm = nm.estimate(Xz[tr, :], Yz[tr, nz[i]], **kwargs)
                yhat, s2 = nm.predict(Xz[te, :], Xz[tr, :],
                                      Yz[tr, nz[i]], **kwargs)
                if savemodel:
                    nm.save('Models/NM_' + str(fold) + '_' + str(nz[i]) +
                            '.pkl')

                if standardize:
                    # map predictions back to the original response scale
                    Yhat[te, nz[i]] = yhat * sY[i] + mY[i]
                    S2[te, nz[i]] = s2 * sY[i]**2
                else:
                    Yhat[te, nz[i]] = yhat
                    S2[te, nz[i]] = s2

                nlZ[nz[i], fold] = nm.neg_log_lik
                if (run_cv or testresp is not None):
                    # warp the labels?
                    if warp is not None:
                        warp_param = \
                            nm.blr.hyp[1:nm.blr.warp.get_n_params() + 1]
                        Ywarp[te, nz[i]] = nm.blr.warp.f(Y[te, nz[i]],
                                                         warp_param)
                        Ytest = Ywarp[te, nz[i]]

                        # Save warped mean of the training data (for MSLL)
                        yw = nm.blr.warp.f(Y[tr, nz[i]], warp_param)
                        mean_resp_warp[fold][i] = np.mean(yw)
                        std_resp_warp[fold][i] = np.std(yw)
                    else:
                        Ytest = Y[te, nz[i]]

                    Z[te, nz[i]] = (Ytest - Yhat[te, nz[i]]) / \
                        np.sqrt(S2[te, nz[i]])

            except Exception as e:
                # Deliberate best effort: one failed model must not abort
                # the whole run, so report it and write NaN to its outputs.
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i + 1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)

                Yhat[te, nz[i]] = float('nan')
                S2[te, nz[i]] = float('nan')
                nlZ[nz[i], fold] = float('nan')
                if testcov is None:
                    Z[te, nz[i]] = float('nan')
                else:
                    if testresp is not None:
                        Z[te, nz[i]] = float('nan')

    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels': nz,
                         'fold_num': cvfolds,
                         'mean_resp': mean_resp,
                         'std_resp': std_resp,
                         'mean_cov': mean_cov,
                         'std_cov': std_cov,
                         'regressor': alg,
                         'standardize': standardize}, file)

    # compute performance metrics
    if (run_cv or testresp is not None):
        print("Evaluating the model ...")
        if warp is None:
            results = evaluate(Y[testids, :], Yhat[testids, :],
                               S2=S2[testids, :],
                               mY=mean_resp[0], sY=std_resp[0])
        else:
            results = evaluate(Ywarp[testids, :], Yhat[testids, :],
                               S2=S2[testids, :],
                               mY=mean_resp_warp[0], sY=std_resp_warp[0])

    # Set writing options
    if saveoutput:
        if (run_cv or testresp is not None):
            save_results(respfile, Yhat[testids, :], S2[testids, :],
                         maskvol, Z=Z[testids, :], results=results,
                         outputsuffix=outputsuffix)
        else:
            save_results(respfile, Yhat[testids, :], S2[testids, :],
                         maskvol, outputsuffix=outputsuffix)
    else:
        if (run_cv or testresp is not None):
            output = (Yhat[testids, :], S2[testids, :], nm,
                      Z[testids, :], results)
        else:
            output = (Yhat[testids, :], S2[testids, :], nm)

        return output
def predict(covfile, respfile, maskfile=None, **kwargs):
    '''
    Make predictions on the basis of a pre-estimated normative model.
    If only the covariates are specified then only predicted mean and variance
    will be returned. If the test responses are also specified then quantities
    that depend on those will also be returned (Z scores and error metrics)

    Basic usage::

        predict(covfile, [extra_arguments])

    where the variables are defined below.

    :param covfile: test covariates used to predict the response variable
    :param respfile: test response variables for the normative model
    :param maskfile: mask used to apply to the data (nifti only)
    :param alg: (keyword, mandatory) algorithm of the stored models
    :param model_path: Directory containing the normative model and metadata.
        When using parallel prediction, do not pass the model path. It will be
        automatically decided.
    :param outputsuffix: Text string to add to the output filenames
        (default '_predict')
    :param inputsuffix: suffix of the stored model files
        (default '_estimate')
    :param tsbefile: test batch effects file; mandatory for transferred HBR
        models
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id

    Any remaining keyword arguments are forwarded to each model's ``predict``.

    All outputs are written to disk in the same format as the input. These
    are:

    :outputs: * Yhat - predictive mean
              * S2 - predictive variance
              * Z - Z scores

    :returns: None if the model directory does not exist, ``(Yhat, S2)``
        without responses, ``(Yhat, S2, Z)`` with responses
    '''
    model_path = kwargs.pop('model_path', 'Models')
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    outputsuffix = kwargs.pop('outputsuffix', '_predict')
    inputsuffix = kwargs.pop('inputsuffix', '_estimate')
    alg = kwargs.pop('alg')

    if respfile is not None and not os.path.exists(respfile):
        print("Response file does not exist. Only returning predictions")
        respfile = None

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return

    meta_path = os.path.join(model_path, 'meta_data.md')
    if os.path.exists(meta_path):
        # NOTE(review): pickle.load must only be used on trusted model
        # directories - unpickling can execute arbitrary code.
        with open(meta_path, 'rb') as file:
            meta_data = pickle.load(file)
        inscaler = meta_data['inscaler']
        outscaler = meta_data['outscaler']
        mY = meta_data['mean_resp']
        sY = meta_data['std_resp']
        scaler_cov = meta_data['scaler_cov']
        scaler_resp = meta_data['scaler_resp']
        has_meta_data = True
    else:
        print("No meta-data file is found!")
        inscaler = 'None'
        outscaler = 'None'
        has_meta_data = False

    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    sample_num = X.shape[0]
    # one model file per feature
    feature_num = len(glob.glob(os.path.join(model_path,
                                             'NM_*' + inputsuffix + '.pkl')))

    Yhat = np.zeros([sample_num, feature_num])
    S2 = np.zeros([sample_num, feature_num])

    if inscaler in ['standardize', 'minmax', 'robminmax']:
        Xz = scaler_cov[0].transform(X)
    else:
        Xz = X

    # Test batch effects for transferred HBR models: loaded on first use and
    # reused. The original popped 'tsbefile' inside the loop, which raised
    # KeyError from the second model onwards.
    batch_effects_test = None

    # estimate the models for all subjects
    for i in range(feature_num):
        print("Prediction by model ", i + 1, "of", feature_num)
        nm = norm_init(Xz)
        nm = nm.load(os.path.join(model_path, 'NM_' + str(0) + '_' +
                                  str(i) + inputsuffix + '.pkl'))
        if (alg != 'hbr' or nm.configs['transferred'] == False):
            yhat, s2 = nm.predict(Xz, **kwargs)
        else:
            if batch_effects_test is None:
                batch_effects_test = fileio.load(kwargs['tsbefile'])
            yhat, s2 = nm.predict_on_new_sites(Xz, batch_effects_test)

        # map predictions back to the original response scale
        if outscaler == 'standardize':
            Yhat[:, i] = scaler_resp[0].inverse_transform(yhat, index=i)
            S2[:, i] = s2.squeeze() * sY[0][i]**2
        elif outscaler in ['minmax', 'robminmax']:
            Yhat[:, i] = scaler_resp[0].inverse_transform(yhat, index=i)
            S2[:, i] = s2.squeeze() * (scaler_resp[0].max[i] -
                                       scaler_resp[0].min[i])**2
        else:
            Yhat[:, i] = yhat.squeeze()
            S2[:, i] = s2.squeeze()

    if respfile is None:
        save_results(None, Yhat, S2, None, outputsuffix=outputsuffix)
        return (Yhat, S2)

    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]

    # warp the targets?
    # (uses the warp of the last model loaded in the loop above)
    if 'blr' in dir(nm):
        if nm.blr.warp is not None:
            warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params() + 1]
            Y = nm.blr.warp.f(Y, warp_param)

    Z = (Y - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    if has_meta_data:
        results = evaluate(Y, Yhat, S2=S2, mY=mY[0], sY=sY[0])
    else:
        results = evaluate(Y, Yhat, S2=S2,
                           metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

    print("Evaluations Writing outputs ...")
    save_results(respfile, Yhat, S2, maskvol, Z=Z,
                 outputsuffix=outputsuffix, results=results)

    return (Yhat, S2, Z)
def fit(covfile, respfile, **kwargs):
    """Fit one normative model per valid response variable.

    Covariates are loaded from ``covfile`` and responses from ``respfile``.
    Response columns without any finite value, or with zero variance, are
    skipped; the covariates are assumed to have been checked already.

    Keyword arguments consumed here (all remaining ones are forwarded to
    ``norm_init`` and to the model's ``estimate`` method):

    * maskfile -> mask passed to ``load_response_vars`` (default None)
    * alg -> algorithm name handed to ``norm_init`` (default 'gpr')
    * savemodel -> 'True' (default) or boolean True to write the models
      and the meta-data into the ``Models`` directory
    * outputsuffix -> suffix appended to the saved model file names
      (default '_fit')
    * inscaler, outscaler -> 'standardize', 'minmax' or 'robminmax' to
      rescale covariates / responses; any other value disables scaling

    Returns the model fitted for the last valid response variable, or
    None when no response variable is valid.
    """
    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    alg = kwargs.pop('alg', 'gpr')
    # str() so that a boolean True is honoured as well as the string 'True'
    savemodel = str(kwargs.pop('savemodel', 'True')) == 'True'
    outputsuffix = kwargs.pop('outputsuffix', '_fit')
    inscaler = kwargs.pop('inscaler', 'None')
    outscaler = kwargs.pop('outscaler', 'None')

    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    scaler_resp = []
    scaler_cov = []
    mean_resp = []  # this is just for computing MSLL
    std_resp = []   # this is just for computing MSLL

    # mean and standard deviation of the valid responses
    mY = np.mean(Y[:, nz], axis=0)
    sY = np.std(Y[:, nz], axis=0)
    mean_resp.append(mY)
    std_resp.append(sY)

    if inscaler in ['standardize', 'minmax', 'robminmax']:
        X_scaler = scaler(inscaler)
        Xz = X_scaler.fit_transform(X)
        scaler_cov.append(X_scaler)
    else:
        Xz = X

    if outscaler in ['standardize', 'minmax', 'robminmax']:
        # invalid columns stay at zero; only the valid ones are rescaled
        Yz = np.zeros_like(Y)
        Y_scaler = scaler(outscaler)
        Yz[:, nz] = Y_scaler.fit_transform(Y[:, nz])
        scaler_resp.append(Y_scaler)
    else:
        Yz = Y

    # Stays None when there is no valid response variable, so that the
    # final return does not fail on an unbound name.
    nm = None

    # estimate the models for all valid response variables
    for count, col in enumerate(nz, start=1):
        print("Estimating model ", count, "of", len(nz))
        nm = norm_init(Xz, Yz[:, col], alg=alg, **kwargs)
        nm = nm.estimate(Xz, Yz[:, col], **kwargs)

        if savemodel:
            nm.save('Models/NM_' + str(0) + '_' + str(col) + outputsuffix +
                    '.pkl')

    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels': nz,
                         'mean_resp': mean_resp,
                         'std_resp': std_resp,
                         'scaler_cov': scaler_cov,
                         'scaler_resp': scaler_resp,
                         'regressor': alg,
                         'inscaler': inscaler,
                         'outscaler': outscaler},
                        file, protocol=PICKLE_PROTOCOL)

    return nm
def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
             **kwargs):
    """Transfer previously estimated HBR models to new data.

    For every response variable (column of the data in ``respfile``) the
    matching model ``NM_0_<index>.pkl`` is loaded from ``model_path``,
    re-estimated with ``estimate_on_new_sites`` and saved under the same
    file name in ``output_path``. If test covariates are given, the
    transferred models are also used to predict on them.

    Mandatory keyword arguments:

    * alg -> must be 'hbr'
    * model_path -> directory holding the models to transfer
    * output_path -> directory for the transferred models (created if
      missing)
    * trbefile -> file with the batch effects of the adaptation data

    Optional keyword arguments:

    * outputsuffix -> suffix handed to ``save_results``
      (default '_transfer')
    * tsbefile -> file with the batch effects of the test data; a
      zero-filled array with two columns is used when it is not given
    * job_id, batch_size -> set when running in batches; the model index
      then becomes ``(job_id - 1) * batch_size + i``

    Returns:

    * None if ``alg`` is not 'hbr' or a mandatory argument is missing
    * None if ``testcov`` is None (models are transferred and saved, but
      there is nothing to predict on)
    * (Yhat, S2) if ``testcov`` is given without ``testresp``
    * (Yhat, S2, Z) if both ``testcov`` and ``testresp`` are given
    """
    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model transferring is only possible for HBR models.')
        return
    if any(key not in kwargs
           for key in ('model_path', 'output_path', 'trbefile')):
        print('InputError: Some mandatory arguments are missing.')
        return

    model_path = kwargs.pop('model_path')
    output_path = kwargs.pop('output_path')
    trbefile = kwargs.pop('trbefile')
    batch_effects_train = fileio.load(trbefile)

    outputsuffix = kwargs.pop('outputsuffix', '_transfer')
    tsbefile = kwargs.pop('tsbefile', None)

    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1  # job ids are given 1-based

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    feature_num = Y.shape[1]
    mY = np.mean(Y, axis=0)
    sY = np.std(Y, axis=0)

    # Everything test-related is only set up when test covariates exist;
    # previously the code below referenced these names unconditionally and
    # failed with a NameError when testcov was None.
    has_test = testcov is not None
    if has_test:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        ts_sample_num = Xte.shape[0]

        if testresp is not None:
            Yte, _ = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            Yte = np.zeros([ts_sample_num, feature_num])

        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            batch_effects_test = np.zeros([Xte.shape[0], 2])

        Yhat = np.zeros([ts_sample_num, feature_num])
        S2 = np.zeros([ts_sample_num, feature_num])

    # transfer the models for all response variables
    for i in range(feature_num):
        nm = norm_init(X)
        if batch_size is not None:  # when using normative_parallel
            model_idx = job_id * batch_size + i
            print("Transferting model ", model_idx)
        else:
            model_idx = i
            print("Transferting model ", i + 1, "of", feature_num)

        # the same file name is used for loading and for saving
        model_file = 'NM_0_' + str(model_idx) + '.pkl'
        nm = nm.load(os.path.join(model_path, model_file))
        nm = nm.estimate_on_new_sites(X, Y[:, i], batch_effects_train)
        nm.save(os.path.join(output_path, model_file))

        if has_test:
            yhat, s2 = nm.predict_on_new_sites(Xte, batch_effects_test)
            Yhat[:, i] = yhat.squeeze()
            S2[:, i] = s2.squeeze()

    if not has_test:
        # models have been transferred and saved; nothing to predict on
        return

    if testresp is None:
        save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
        return (Yhat, S2)

    Z = (Yte - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    results = evaluate(Yte, Yhat, S2=S2, mY=mY, sY=sY)

    save_results(respfile, Yhat, S2, maskvol, Z=Z, results=results,
                 outputsuffix=outputsuffix)

    return (Yhat, S2, Z)
def predict(covfile, respfile=None, maskfile=None, **kwargs):
    """Predict with previously saved normative models.

    One model file ``NM_0_<i>.pkl`` per response variable is loaded from
    the model directory and applied to the covariates in ``covfile``. If
    ``meta_data.md`` exists in that directory, its 'standardize' flag and
    stored means / standard deviations are used to scale the covariates
    and to rescale the predictions.

    Keyword arguments consumed here (the rest is forwarded to the model's
    ``predict`` method):

    * model_path -> directory with the models (default 'Models')
    * output_path -> directory for the evaluation outputs, created if it
      does not exist (default '')
    * outputsuffix -> suffix handed to ``save_results``
      (default '_predict')
    * job_id, batch_size -> removed from kwargs and converted to int, but
      not used any further in this function

    Returns:

    * None if the model directory does not exist
    * (Yhat, S2) if no response file is given, or the given one does not
      exist
    * (Yhat, S2, Z) otherwise
    """
    model_path = kwargs.pop('model_path', 'Models')
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    output_path = kwargs.pop('output_path', '')
    outputsuffix = kwargs.pop('outputsuffix', '_predict')

    if respfile is not None and not os.path.exists(respfile):
        print("Response file does not exist. Only returning predictions")
        respfile = None

    # guard clause: nothing can be done without the models
    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return

    # scaling information is optional; without it the data are used as-is
    standardize = False
    meta_path = os.path.join(model_path, 'meta_data.md')
    if os.path.exists(meta_path):
        with open(meta_path, 'rb') as file:
            meta_data = pickle.load(file)
        standardize = meta_data['standardize']
        mY = meta_data['mean_resp']
        sY = meta_data['std_resp']
        mX = meta_data['mean_cov']
        sX = meta_data['std_cov']

    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    if output_path != '' and not os.path.isdir(output_path):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    n_samples = X.shape[0]
    model_files = glob.glob(os.path.join(model_path, 'NM_*.pkl'))
    feature_num = len(model_files)

    out_shape = [n_samples, feature_num]
    Yhat = np.zeros(out_shape)
    S2 = np.zeros(out_shape)
    Z = np.zeros(out_shape)

    Xz = (X - mX[0]) / sX[0] if standardize else X

    # apply every stored model in turn
    for idx in range(feature_num):
        print("Prediction by model ", idx + 1, "of", feature_num)
        model_file = 'NM_' + str(0) + '_' + str(idx) + '.pkl'
        nm = norm_init(Xz)
        nm = nm.load(os.path.join(model_path, model_file))
        yhat, s2 = nm.predict(Xz, **kwargs)

        if standardize:
            # map predictions back to the original response scale
            Yhat[:, idx] = yhat.squeeze() * sY[0][idx] + mY[0][idx]
            S2[:, idx] = s2.squeeze() * sY[0][idx]**2
        else:
            Yhat[:, idx] = yhat.squeeze()
            S2[:, idx] = s2.squeeze()

    if respfile is None:
        save_results(None, Yhat, S2, None, outputsuffix=outputsuffix)
        return (Yhat, S2)

    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]

    # warp the targets? (uses the model loaded last in the loop above)
    if 'blr' in dir(nm) and nm.blr.warp is not None:
        n_warp = nm.blr.warp.get_n_params()
        warp_param = nm.blr.hyp[1:n_warp + 1]
        Y = nm.blr.warp.f(Y, warp_param)

    Z = (Y - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    results = evaluate(Y, Yhat, S2=S2,
                       metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

    print("Evaluations Writing outputs ...")
    save_results(respfile, Yhat, S2, maskvol, Z=Z,
                 outputsuffix=outputsuffix, results=results,
                 save_path=output_path)

    return (Yhat, S2, Z)
def _replace_in_job_script(jobpath, old, new):
    """Rewrite the file ``jobpath`` in place, replacing ``old`` by ``new``."""
    with fileinput.FileInput(jobpath, inplace=True) as file:
        for line in file:
            # with inplace=True, printed output is written back to the file
            print(line.replace(old, new), end='')


def rerun_nm(processing_dir, memory, duration, new_memory=False,
             new_duration=False, binary=False, **kwargs):
    """ This function reruns all failed batches in processing_dir after
    collect_nm has identified the failed batches.

    * Input:
        * processing_dir -> Full path to the processing directory
        * memory         -> Memory requirements written as string,
                            for example 4gb or 500mb
        * duration       -> The approximate duration of the job, a string
                            with HH:MM:SS, for example 01:01:01
        * new_memory     -> If you want to change the memory you have to
                            indicate it here (False or None: no change)
        * new_duration   -> If you want to change the duration you have to
                            indicate it here (False or None: no change)
        * binary         -> If True the list of failed batches is read
                            from 'failed_batches.pkl', otherwise from
                            'failed_batches.txt'
        * log_path       -> (keyword argument) passed on to sbatch_nm

    * Outputs:
        * Reruns failed batches.

    written by (primarily) T Wolfers
    """
    log_path = kwargs.pop('log_path', None)

    # The two storage formats only differ in how the list is loaded and
    # indexed; collect the job paths first and handle them uniformly.
    if binary:
        failed_batches = fileio.load(
            os.path.join(processing_dir, 'failed_batches' + '.pkl'))
        jobpaths = [failed_batches[n, 0]
                    for n in range(failed_batches.shape[0])]
    else:
        failed_batches = fileio.load_pd(
            os.path.join(processing_dir, 'failed_batches' + '.txt'))
        jobpaths = [failed_batches.iloc[n, 0]
                    for n in range(failed_batches.shape[0])]

    for jobpath in jobpaths:
        print(jobpath)
        # None is treated like False; it used to reach str.replace and fail
        if new_duration is not None and new_duration != False:
            _replace_in_job_script(jobpath, duration, new_duration)
        if new_memory is not None and new_memory != False:
            _replace_in_job_script(jobpath, memory, new_memory)
        sbatch_nm(jobpath, log_path)