def prepare_folds(hdf5, folds, pheno, derivatives, experiment):
    """Write stratified train/valid/test subject splits for each derivative.

    For every derivative an experiment group is created (or reused) under
    ``hdf5["experiments"]``; its name is ``experiment`` rendered through
    ``format_config`` with the derivative filled in.  Every fold becomes a
    sub-group named after the fold number that holds three datasets --
    ``train``, ``valid`` and ``test`` -- of UTF-8 encoded ``FILE_ID`` values.

    Args:
        hdf5: Object exposing ``require_group`` (presumably an open h5py
            file or group -- confirm against the caller).
        folds: Number of stratified folds to generate.
        pheno: Phenotype table with ``FILE_ID`` and ``STRAT`` columns
            (assumed to be a pandas DataFrame).
        derivatives: Iterable of derivative (brain atlas) names.
        experiment: Experiment name template containing a ``{derivative}``
            placeholder.
    """
    exps = hdf5.require_group("experiments")
    ids = pheno["FILE_ID"]

    for derivative in derivatives:
        # One experiment group per brain atlas.
        exp = exps.require_group(
            format_config(experiment, {"derivative": derivative})
        )
        exp.attrs["derivative"] = derivative

        # Split into `folds` parts that keep the label proportions of the
        # STRAT column, shuffling the subjects first.
        skf = StratifiedKFold(n_splits=folds, shuffle=True)
        for i, (train_index, test_index) in enumerate(skf.split(ids, pheno["STRAT"])):
            # Hold out a third of the training portion for validation.
            train_index, valid_index = train_test_split(train_index, test_size=0.33)

            fold = exp.require_group(str(i))
            splits = (
                ("train", train_index),
                ("valid", valid_index),
                ("test", test_index),
            )
            for split_name, positions in splits:
                # The splitters return row positions, not index labels, so
                # select with .iloc.  Plain ids[positions] is a label lookup
                # on an integer-indexed (e.g. filtered) table and would pick
                # the wrong rows or raise KeyError.
                fold[split_name] = [
                    file_id.encode("utf8") for file_id in ids.iloc[positions]
                ]
def load_patient(subj, tmpl):
    """Load one subject's ROI table and convert it into model features.

    The file path is produced by ``format_config(tmpl, {"subject": subj})``
    and read as a tab-separated table whose column labels are a one-character
    prefix followed by an ROI number.  Depending on the global
    ``arguments["--lstm"]`` flag the standardized matrix is returned as-is,
    or passed through ``compute_connectivity``.

    Args:
        subj: Subject identifier substituted into the path template.
        tmpl: Path template with a ``{subject}`` placeholder.

    Returns:
        A ``(subj, features)`` tuple, where ``features`` is the float32
        result converted to nested Python lists.
    """
    # Build the data path from the template and read the file.
    data_path = format_config(tmpl, {"subject": subj})
    df = pd.read_csv(data_path, sep="\t", header=0)
    # Anything that cannot be parsed as a number becomes NaN.
    df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

    # ROI column labels, ordered by their numeric part.
    roi_numbers = sorted(int(label[1:]) for label in df.keys().tolist())
    ROIs = ["#" + str(number) for number in roi_numbers]

    use_lstm = arguments["--lstm"]

    series = df[ROIs].to_numpy()
    if not use_lstm:
        # One row per ROI; per the original note, e.g. 200 rows (ROIs) of
        # 196 values each -- presumably atlas-specific, confirm.
        series = series.T
    # Replace invalid (NaN) entries with 0.
    functional = np.nan_to_num(series).tolist()

    # axis=1: standardize every row independently.
    functional = preprocessing.scale(functional, axis=1)

    if use_lstm:
        functional = np.array(functional)
    else:
        # Connectivity between every pair of ROIs.
        functional = compute_connectivity(functional)

    functional = functional.astype(np.float32)
    return subj, functional.tolist()
load_patients_to_file(hdf5, pheno, derivatives)

# Cross-validation folds over the complete dataset.
if arguments["--whole"]:
    print("Preparing whole dataset")
    prepare_folds(
        hdf5, folds, pheno, derivatives,
        experiment="{derivative}_whole",
    )

# Cross-validation folds restricted to male subjects.
if arguments["--male"]:
    print("Preparing male dataset")
    is_male = pheno["SEX"] == "M"
    pheno_male = pheno.loc[is_male]
    prepare_folds(
        hdf5, folds, pheno_male, derivatives,
        experiment="{derivative}_male",
    )

# Cross-validation folds restricted to rows with MEAN_FD <= 0.2.
if arguments["--threshold"]:
    print("Preparing thresholded dataset")
    below_threshold = pheno["MEAN_FD"] <= 0.2
    pheno_thresh = pheno.loc[below_threshold]
    prepare_folds(
        hdf5, folds, pheno_thresh, derivatives,
        experiment="{derivative}_threshold",
    )

# One set of cross-validation folds per site, each built from the rows of
# every other site.
if arguments["--leave-site-out"]:
    print("Preparing leave-site-out dataset")
    for site in pheno["SITE_ID"].unique():
        pheno_without_site = pheno.loc[pheno["SITE_ID"] != site]
        # Only the site is filled in here; prepare_folds substitutes the
        # derivative (presumably format_config leaves unknown placeholders
        # untouched -- confirm).
        site_experiment = format_config(
            "{derivative}_leavesiteout-{site}",
            {"site": site},
        )
        prepare_folds(
            hdf5, folds, pheno_without_site, derivatives,
            experiment=site_experiment,
        )
# NOTE(review): the next two statements look like the body of a conditional
# (probably the "--male" check) whose header lies outside this excerpt --
# confirm their placement.
male_rows = pheno["SEX"] == "M"
pheno_male = pheno.loc[male_rows]
prepare_folds(
    hdf5, folds, pheno_male, derivatives,
    experiment="{derivative}_male",
)

# Cross-validation folds restricted to rows with MEAN_FD <= 0.2.
if arguments["--threshold"]:
    print("Preparing thresholded dataset")
    low_motion_rows = pheno["MEAN_FD"] <= 0.2
    pheno_thresh = pheno.loc[low_motion_rows]
    prepare_folds(
        hdf5, folds, pheno_thresh, derivatives,
        experiment="{derivative}_threshold",
    )

# One set of cross-validation folds per site, each built from the rows of
# every other site.
if arguments["--leave-site-out"]:
    print("Preparing leave-site-out dataset")
    for site in pheno["SITE_ID"].unique():
        other_site_rows = pheno["SITE_ID"] != site
        pheno_without_site = pheno.loc[other_site_rows]
        experiment_template = format_config(
            "{derivative}_leavesiteout-{site}",
            {"site": site},
        )
        prepare_folds(
            hdf5, folds, pheno_without_site, derivatives,
            experiment=experiment_template,
        )
# Brain atlas selection: keep only the requested derivatives that are
# supported.
valid_derivatives = ["cc200", "aal", "ez", "ho", "tt", "dosenbach160"]
derivatives = [
    derivative
    for derivative in arguments["<derivative>"]
    if derivative in valid_derivatives
]

# Collect the names of the experiments selected by the command-line flags.
experiments = []
for derivative in derivatives:
    config = {"derivative": derivative}

    if arguments["--whole"]:
        # The original statement ended with a stray trailing comma, which
        # turned the right-hand side into a tuple and stored a nested list
        # instead of the experiment name.
        experiments.append(
            PrepareUtils.format_config("{derivative}_whole", config)
        )

    if arguments["--male"]:
        experiments.append(
            PrepareUtils.format_config("{derivative}_male", config)
        )

    if arguments["--threshold"]:
        experiments.append(
            PrepareUtils.format_config("{derivative}_threshold", config)
        )

    if arguments["--leave-site-out"]:
        for site in pheno["SITE_ID"].unique():
            # NOTE(review): the loop body appears to continue past this
            # excerpt; only the visible statement is reproduced here.
            site_config = {"site": site}