import timeit

import numpy as np
from scipy import stats
from scipy.io import loadmat
from statsmodels.sandbox.stats.multicomp import fdrcorrection0  # legacy statsmodels location

# Project-local helpers are assumed importable here (paths depend on the
# package layout): read_x, lpks, sif, wald_ht, bstrp_pvalue, and the mfsda module.


def run_script(input_dir, output_dir):
    """
    Run the command-line script for MFSDA.

    Args:
        input_dir (str): full path to the data folder (with trailing separator)
        output_dir (str): full path to the output folder (with trailing separator)
    """

    """+++++++++++++++++++++++++++++++++++"""
    """Step 1. Load dataset"""

    print("loading data ......")

    print("+++++++Read the surface shape data+++++++")
    shape_file_name = input_dir + "aligned_shapes.mat"
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    n, l, m = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))

    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = input_dir + "template.mat"
    mat = loadmat(template_file_name)
    coord_mat = mat['template']
    # d = coord_mat.shape[1]

    print("+++++++Read the design matrix+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data_tmp = np.loadtxt(design_data_file_name)  # whitespace-delimited by default
    if len(design_data_tmp.shape) == 1:
        # a single covariate loads as a 1-d array; promote it to a column vector
        design_data = np.reshape(design_data_tmp, (design_data_tmp.shape[0], 1))
    else:
        design_data = design_data_tmp

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)

    print("+++++++Construct the design matrix: normalization+++++++")
    # built here only to report its dimension; run_stats rebuilds it internally
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is " + str(x_design.shape))

    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta = \
        mfsda.run_stats(y_design, coord_mat, design_data, var_type)

    """+++++++++++++++++++++++++++++++++++"""
    """Step 3. Save all the results"""

    gpvals_file_name = output_dir + "global_pvalue.txt"
    np.savetxt(gpvals_file_name, gpvals)
    lpvals_fdr_file_name = output_dir + "local_pvalue_fdr.txt"
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)
    clu_pvals_file_name = output_dir + "cluster_pvalue.txt"
    np.savetxt(clu_pvals_file_name, clu_pvals)
def run_stats(y_design, coord_mat, design_data, var_type):
    n, l, m = y_design.shape

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is " + str(x_design.shape))

    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print("+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++")
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d) = 0 vs. beta_pj(d) != 0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p - 1))
    lpvals_fdr = np.zeros((l, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))
    areas = np.zeros((1, p - 1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # -log10(p) threshold defining a significant subregion

    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) + " is zero or not...")

        """ local and global statistics calculation """
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1  # contrast vector selecting the (pp+1)-th coefficient
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        lpvals[:, pp] = 1 - np.squeeze(stats.chi2.cdf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        """ Generate random samples and calculate the corresponding statistics and p-values """
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)
        gpvals[0, pp] = gpval
        areas[0, pp] = area
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp + 1) + " is "
              + str(gpvals[0, pp]) + "...")
        print("the p-value of the most significant subregion for covariate "
              + str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    return gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta
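# A note on the FDR step above: fdrcorrection0 comes from statsmodels' sandbox
# namespace, which newer statsmodels releases deprecate. A minimal,
# self-contained sketch of the equivalent Benjamini-Hochberg correction with
# the current public API (statsmodels.stats.multitest.fdrcorrection) follows;
# it is an illustration with made-up p-values, not part of the original pipeline.

def _fdr_example():
    import numpy as np
    from statsmodels.stats.multitest import fdrcorrection

    raw_pvals = np.array([0.001, 0.02, 0.04, 0.30, 0.85])  # toy values
    rejected, pvals_adj = fdrcorrection(raw_pvals, alpha=0.05)
    # pvals_adj matches the second return value of the legacy fdrcorrection0
    print(rejected, pvals_adj)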
def run_script(input_dir, output_dir):
    """
    Run the command-line script for MFSDA (standalone variant that inlines the
    statistics of run_stats).

    Args:
        input_dir (str): full path to the data folder (with trailing separator)
        output_dir (str): full path to the output folder (with trailing separator)
    """

    """+++++++++++++++++++++++++++++++++++"""
    """Step 1. Load dataset"""

    print("loading data ......")

    print("+++++++Read the surface shape data+++++++")
    shape_file_name = input_dir + "aligned_shapes.mat"
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    n, l, m = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))

    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = input_dir + "template.mat"
    mat = loadmat(template_file_name)
    coord_mat = mat['template']
    # d = coord_mat.shape[1]

    print("+++++++Read the design matrix+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is " + str(x_design.shape))

    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print("+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++")
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d) = 0 vs. beta_pj(d) != 0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p - 1))
    lpvals_fdr = np.zeros((l, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))
    areas = np.zeros((1, p - 1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2  # -log10(p) threshold defining a significant subregion

    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) + " is zero or not...")

        """ local and global statistics calculation """
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1  # contrast vector selecting the (pp+1)-th coefficient
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        lpvals[:, pp] = 1 - np.squeeze(stats.chi2.cdf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        """ Generate random samples and calculate the corresponding statistics and p-values """
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)
        gpvals[0, pp] = gpval
        areas[0, pp] = area
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp + 1) + " is "
              + str(gpvals[0, pp]) + "...")
        print("the p-value of the most significant subregion for covariate "
              + str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    """+++++++++++++++++++++++++++++++++++"""
    """Step 3. Save all the results"""

    gpvals_file_name = output_dir + "global_pvalue.txt"
    np.savetxt(gpvals_file_name, gpvals)
    lpvals_fdr_file_name = output_dir + "local_pvalue_fdr.txt"
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)
    clu_pvals_file_name = output_dir + "cluster_pvalue.txt"
    np.savetxt(clu_pvals_file_name, clu_pvals)
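# A minimal sketch of how run_script might be invoked from the command line,
# assuming this file is executed directly; the argparse wrapper is illustrative
# and not part of the original source.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run MFSDA on a data folder.")
    parser.add_argument("input_dir", help="data folder (trailing separator expected)")
    parser.add_argument("output_dir", help="output folder (trailing separator expected)")
    args = parser.parse_args()
    run_script(args.input_dir, args.output_dir)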
import os
import time

import numpy as np
from scipy.io import loadmat, savemat

# Project-local helpers are assumed importable here (paths depend on the
# package layout): read_x, bw_rt, mvcm, and gsis.


def run_script(input_dir, output_dir):
    """
    Run the command-line script for FGWAS.

    :param input_dir: (str) full path to the data folder (with trailing separator)
    :param output_dir: (str) full path to the output folder (with trailing separator)
    """

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 0. load dataset """)

    print("+++++++Read the imaging data+++++++")
    img_file_name = input_dir + "img_data.mat"
    mat = loadmat(img_file_name)
    img_data = mat['img_data']
    n, l, m = img_data.shape
    img_data = np.log10(img_data)  # log transformation on the response
    print("The matrix dimension of image data is " + str(img_data.shape))

    print("+++++++Read the imaging coordinate data+++++++")
    coord_file_name = input_dir + "coord_data.txt"
    coord_data = np.loadtxt(coord_file_name)
    # d = coord_data.shape[1]
    print("The matrix dimension of coordinate data is " + str(coord_data.shape))

    print("+++++++Read the SNP data+++++++")
    snp_file_name = input_dir + "snp_data.txt"
    snp_data = np.loadtxt(snp_file_name)
    # g = snp_data.shape[1]
    print("The matrix dimension of original snp data is " + str(snp_data.shape))

    print("+++++++Read the covariate data+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)
    # design_data = design_data[:, np.arange(5)]
    print("The matrix dimension of covariate data is " + str(design_data.shape))

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)

    # read the image size
    img_size_file_name = input_dir + "img_size.txt"
    img_size = np.loadtxt(img_size_file_name)

    # read the image index of the non-background region
    img_idx_file_name = input_dir + "img_idx.txt"
    img_idx = np.loadtxt(img_idx_file_name)

    print("+++++++++Matrix preparing and Data preprocessing++++++++")
    print("+++++++Construct the imaging response, design, coordinate matrix: normalization+++++++")
    x_design, y_design, coord_data = read_x(img_data, coord_data, design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of normalized design matrix is " + str(x_design.shape))

    print("+++++++Preprocess SNP: filtering+++++++")
    # impute missing genotypes (coded as negative values) with the most
    # frequent genotype of the corresponding SNP
    max_num = np.zeros(shape=(3, snp_data.shape[1]))
    for i in range(3):
        bw = np.zeros(snp_data.shape)
        bw[snp_data == i] = 1
        max_num[i, :] = np.sum(bw, axis=0)
    max_num_idx = np.argmax(max_num, axis=0)
    indx = np.where(snp_data < 0)
    for i in range(len(indx[1])):
        snp_data[indx[0][i], indx[1][i]] = max_num_idx[indx[1][i]]

    min_maf = 0.05  # threshold for MAF
    maf = np.sum(snp_data, axis=0) / (2 * n)
    temp_idx = np.where(maf > 0.5)
    maf[temp_idx] = 1 - maf[temp_idx]  # fold so maf is the *minor* allele frequency
    rm_snp_index = np.where(maf <= min_maf)
    snp = np.delete(snp_data, rm_snp_index, axis=1)
    print("There are " + str(snp.shape[1]) + " snps with MAF>0.05.")

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 1. Fit the multivariate varying coefficient model (MVCM) """)

    start_1 = time.time()
    # find the optimal bandwidth
    h_opt, hat_mat = bw_rt(coord_data, x_design, y_design)
    print("the optimal bandwidth by Scott's Rule is ", h_opt)
    qr_smy_mat, esig_eta, smy_design, resy_design, efit_eta = mvcm(coord_data, y_design, h_opt, hat_mat)
    end_1 = time.time()
    print("Elapsed time in Step 1 is ", end_1 - start_1)
    # print(esig_eta)
    # print(qr_smy_mat)

    for mii in range(m):
        res_mii = resy_design[:, :, mii] - efit_eta[:, :, mii]
        print("The bound of the residual is [" + str(np.min(res_mii)) + ", " + str(np.max(res_mii)) + "]")
        # res_img = np.reshape(np.mean(res_mii, axis=0), (int(img_size[0]), int(img_size[1])))
        # res_img_file_name = output_dir + "residual_%d.txt" % mii
        # np.savetxt(res_img_file_name, res_img)

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 2. Global sure independence screening (GSIS) """)

    start_2 = time.time()
    g_num = 1000  # number of top candidate snps
    g_pv_log10 = gsis(snp, qr_smy_mat, hat_mat)[0]
    g_pv_log10_file_name = output_dir + "g_pv_log10.txt"
    np.savetxt(g_pv_log10_file_name, g_pv_log10)
    snp_pv = 10 ** (-g_pv_log10)
    top_snp_idx = np.argsort(-g_pv_log10)
    top_snp = snp[:, top_snp_idx[0:g_num]]
    end_2 = time.time()
    print("Elapsed time in Step 2 is ", end_2 - start_2)
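# To make the SNP preprocessing above concrete, here is a small self-contained
# sketch of the same minor-allele-frequency (MAF) filter on a toy genotype
# matrix (rows = subjects, columns = SNPs, values in {0, 1, 2}, negatives =
# missing). The data values are illustrative only.

def _maf_filter_example():
    import numpy as np

    snp_data = np.array([[0, 2, -1],
                         [1, 2, 0],
                         [0, 2, 0],
                         [1, 2, 0]], dtype=float)
    n = snp_data.shape[0]

    # impute missing entries with the most frequent genotype of that SNP
    counts = np.stack([(snp_data == g).sum(axis=0) for g in range(3)])
    modal = np.argmax(counts, axis=0)
    miss_r, miss_c = np.where(snp_data < 0)
    snp_data[miss_r, miss_c] = modal[miss_c]

    # allele frequency, folded so it is always the minor allele frequency
    maf = snp_data.sum(axis=0) / (2 * n)
    maf = np.where(maf > 0.5, 1 - maf, maf)

    kept = snp_data[:, maf > 0.05]  # drop SNPs with MAF <= 0.05
    print(maf, kept.shape)  # -> [0.25 0.   0.  ] (4, 1)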
def run_script(input_dir, output_dir):
    """
    Run the command-line script for FGWAS (variant that also exports GSIS
    tables and temporary results for the next stage).

    :param input_dir: (str) full path to the data folder (with trailing separator)
    :param output_dir: (str) full path to the output folder (with trailing separator)
    """

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 0. load dataset """)

    print("+++++++Read the imaging data+++++++")
    img_file_name = input_dir + "img_data.mat"
    mat = loadmat(img_file_name)
    img_data = mat['img_data']
    if len(img_data.shape) == 2:
        # a single imaging measure loads as 2-d; promote it to 3-d
        img_data = img_data.reshape(1, img_data.shape[0], img_data.shape[1])
    m, n, n_v = img_data.shape
    y_design = np.log10(img_data)  # log transformation on response
    print("The matrix dimension of image data is " + str(img_data.shape))

    print("+++++++Read the imaging coordinate data+++++++")
    coord_file_name = input_dir + "coord_data.txt"
    coord_data = np.loadtxt(coord_file_name)
    print("The matrix dimension of coordinate data is " + str(coord_data.shape))

    print("+++++++Read the SNP data+++++++")
    snp_file_name = input_dir + "snp_data.txt"
    snp_data = np.loadtxt(snp_file_name)
    print("The matrix dimension of original snp data is " + str(snp_data.shape))

    print("+++++++Read the covariate data+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)
    print("The matrix dimension of covariate data is " + str(design_data.shape))

    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)
    var_type = np.array([int(i) for i in var_type])

    print("+++++++++Matrix preparing and Data preprocessing++++++++")
    print("+++++++Construct the imaging response, design, coordinate matrix: normalization+++++++")
    x_design, coord_data = read_x(coord_data, design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of normalized design matrix is " + str(x_design.shape))

    print("+++++++Preprocess SNP: filtering+++++++")
    # impute missing genotypes (coded as negative values) with the most
    # frequent genotype of the corresponding SNP
    max_num = np.zeros(shape=(3, snp_data.shape[1]))
    for i in range(3):
        bw = np.zeros(snp_data.shape)
        bw[snp_data == i] = 1
        max_num[i, :] = np.sum(bw, axis=0)
    max_num_idx = np.argmax(max_num, axis=0)
    indx = np.where(snp_data < 0)
    for i in range(len(indx[1])):
        snp_data[indx[0][i], indx[1][i]] = max_num_idx[indx[1][i]]

    min_maf = 0.05  # threshold for MAF
    maf = np.sum(snp_data, axis=0) / (2 * n)
    temp_idx = np.where(maf > 0.5)
    maf[temp_idx] = 1 - maf[temp_idx]  # fold so maf is the *minor* allele frequency
    rm_snp_index = np.where(maf <= min_maf)
    snp = np.delete(snp_data, rm_snp_index, axis=1)
    g = snp.shape[1]
    print("There are " + str(snp.shape[1]) + " snps with MAF>0.05.")

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 1. Fit the multivariate varying coefficient model (MVCM) under H0 """)

    start_1 = time.time()
    # find the optimal bandwidth
    h_opt, hat_mat = bw_rt(coord_data, x_design, y_design)
    print("the optimal bandwidth by Scott's Rule is ", h_opt)
    qr_smy_mat, esig_eta, smy_design, resy_design, efit_eta = mvcm(coord_data, y_design, h_opt, hat_mat)
    end_1 = time.time()
    print("Elapsed time in Step 1 is ", end_1 - start_1)

    for mii in range(m):
        res_mii = resy_design[mii, :, :] - efit_eta[mii, :, :]
        print("The bound of the residual is [" + str(np.min(res_mii)) + ", " + str(np.max(res_mii)) + "]")

    """+++++++++++++++++++++++++++++++++++"""
    print(""" Step 2. Global sure independence screening (GSIS) """)

    start_2 = time.time()
    g_num = 1000  # number of top candidate snps
    g_pv_log10, g_stat = gsis(snp, qr_smy_mat, hat_mat)
    snp_pv = 10 ** (-g_pv_log10)
    top_snp_idx = np.argsort(-g_pv_log10)
    top_snp = snp[:, top_snp_idx[0:g_num]]

    # read the SNP annotation (.map columns used: 0 = chromosome, 1 = name, 3 = base position)
    snp_info_file = input_dir + "snp_info.map"
    fd = open(snp_info_file, 'r')
    snp_info = np.loadtxt(fd, delimiter='\t', dtype=bytes).astype(str)
    fd.close()
    snp_chr_tp = np.delete(snp_info[:, 0], rm_snp_index)
    snp_chr = np.array([int(i) for i in snp_chr_tp])
    snp_name = np.delete(snp_info[:, 1], rm_snp_index)
    snp_bp_tp = np.delete(snp_info[:, 3], rm_snp_index)
    snp_bp = np.array([int(i) for i in snp_bp_tp])
    gsis_all = np.vstack((snp_chr, snp_bp, snp_pv)).T  # input for plotting the Manhattan plot

    top_snp_chr = snp_chr[top_snp_idx[0:g_num]]
    top_snp_name = snp_name[top_snp_idx[0:g_num]]
    top_snp_bp = snp_bp[top_snp_idx[0:g_num]]
    top_snp_pv_log10 = g_pv_log10[top_snp_idx[0:g_num]]
    gsis_top = np.vstack((top_snp_name, top_snp_chr, top_snp_bp, top_snp_pv_log10)).T  # top SNP GSIS results

    gsis_all_file_name = output_dir + "GSIS_all.txt"
    np.savetxt(gsis_all_file_name, gsis_all, delimiter="\t", fmt="%d %d %f")
    gsis_top_file_name = output_dir + "GSIS_top.txt"
    np.savetxt(gsis_top_file_name, gsis_top, delimiter="\t", fmt="%s", comments='',
               header="SNP\tCHR\tBP\tP")
    end_2 = time.time()
    print("Elapsed time in Step 2 is ", end_2 - start_2)

    # save results in a temp folder for the next step
    start_3 = time.time()
    temp_dir = output_dir + "/temp/"
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    data_dim = np.array([n, n_v, m, p, g, g_num])
    data_dim_file_name = temp_dir + "data_dim.mat"
    savemat(data_dim_file_name, mdict={'data_dim': data_dim})
    all_snp_file_name = temp_dir + "snp.mat"
    savemat(all_snp_file_name, mdict={'snp': snp})
    top_snp_file_name = temp_dir + "top_snp.mat"
    savemat(top_snp_file_name, mdict={'top_snp': top_snp})
    y_design_file_name = temp_dir + "y_design.mat"
    savemat(y_design_file_name, mdict={'y_design': y_design})
    resy_design_file_name = temp_dir + "resy_design.mat"
    savemat(resy_design_file_name, mdict={'resy_design': resy_design})
    efit_eta_file_name = temp_dir + "efit_eta.mat"
    savemat(efit_eta_file_name, mdict={'efit_eta': efit_eta})
    esig_eta_file_name = temp_dir + "esig_eta.mat"
    savemat(esig_eta_file_name, mdict={'esig_eta': esig_eta})
    hat_mat_file_name = temp_dir + "hat_mat.mat"
    savemat(hat_mat_file_name, mdict={'hat_mat': hat_mat})
    end_3 = time.time()
    print("Elapsed time in saving temp results is ", end_3 - start_3)
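# A minimal sketch of how a later stage might reload the temporary results
# saved above. The dictionary keys mirror the mdict names passed to savemat;
# the downstream code itself is not part of this file, so this helper is an
# illustrative assumption, not the project's actual next-stage loader.

def _load_temp_results(temp_dir):
    from scipy.io import loadmat

    # savemat stores the 1-d dimension vector as a (1, 6) array
    n, n_v, m, p, g, g_num = loadmat(temp_dir + "data_dim.mat")['data_dim'][0]
    top_snp = loadmat(temp_dir + "top_snp.mat")['top_snp']
    y_design = loadmat(temp_dir + "y_design.mat")['y_design']
    esig_eta = loadmat(temp_dir + "esig_eta.mat")['esig_eta']
    hat_mat = loadmat(temp_dir + "hat_mat.mat")['hat_mat']
    return n, n_v, m, p, g, g_num, top_snp, y_design, esig_eta, hat_mat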