def test_fast_scanner_statsmodel_gls():
    """Compare the scanner's null effect-size standard errors with statsmodels.

    The Longley dataset is loaded through statsmodels, an AR(1)-style
    covariance ``rho ** toeplitz(...)`` is built from the OLS residuals and
    used as the LMM covariance.  The test then checks ``null_beta_se``
    against a least-squares reference and against ``sm.GLS``.
    """
    import statsmodels.api as sm
    from numpy.linalg import lstsq

    def _solve(lhs, rhs):
        # Least-squares solve; only the solution array is of interest.
        solution = lstsq(lhs, rhs, rcond=None)
        return solution[0]

    longley = sm.datasets.longley.load()
    longley.exog = sm.add_constant(longley.exog)

    # Estimate rho by regressing the OLS residuals on their lag-1 values.
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid
    lagged_fit = sm.OLS(residuals[1:], sm.add_constant(residuals[:-1])).fit()
    rho = lagged_fit.params[1]
    lag_matrix = toeplitz(range(len(residuals)))
    sigma = rho ** lag_matrix
    QS = economic_qs(sigma)

    # --- raw data: compare against a direct least-squares computation ---
    lmm = LMM(longley.endog, longley.exog, QS)
    lmm.fit(verbose=False)
    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    weighted_exog = _solve(lmm.covariance(), longley.exog)
    reference_cov = _solve(longley.exog.T @ weighted_exog, eye(7))
    reference_se = sqrt(reference_cov.diagonal())
    assert_allclose(scanner.null_beta_se, reference_se, atol=1e-5)

    # --- standardised data: compare against statsmodels GLS ---
    std_endog = longley.endog.copy()
    std_endog -= std_endog.mean(0)
    std_endog /= std_endog.std(0)

    std_exog = longley.exog.copy()
    std_exog -= std_exog.mean(0)
    # The constant column has zero standard deviation; silence the resulting
    # division warnings and restore the column right after.
    with errstate(invalid="ignore", divide="ignore"):
        std_exog /= std_exog.std(0)
    std_exog[:, 0] = 1

    lmm = LMM(std_endog, std_exog, QS)
    lmm.fit(verbose=False)
    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    gls_fit = sm.GLS(std_endog, std_exog, sigma=sigma).fit()
    gls_se = gls_fit.bse
    scanner_se = sqrt(scanner.null_beta_covariance.diagonal())

    # statsmodels rescales the covariance matrix it is given, so its standard
    # errors are divided by sqrt(scale) before comparing.
    rescaled_se = gls_se / sqrt(gls_fit.scale)
    assert_allclose(scanner_se, rescaled_se)
    assert_allclose(scanner.null_beta_se, gls_se / sqrt(gls_fit.scale))
def _perform_lmm(y, M, QS, G, verbose):
    """Fit the null LMM on ``y`` and scan every candidate in ``G``.

    ``M`` must expose ``.values`` and a ``"covariate"`` coordinate; ``G`` must
    expose ``.coords`` plus either ``.data`` or ``.values``.  Returns a
    ``QTLModel`` built from the null log marginal likelihood, the per-candidate
    alternative log marginal likelihoods and effect sizes, and the covariate
    effect sizes of the null model.
    """
    from glimix_core.lmm import LMM
    from pandas import Series
    from xarray import DataArray

    null_model = LMM(y, M.values, QS)
    null_model.fit(verbose=verbose)
    sys.stdout.flush()

    null_lml = null_model.lml()
    null_beta = null_model.beta
    covariate_names = list(M.coords["covariate"].values)
    ncov_effsizes = Series(null_beta, covariate_names)

    scanner = null_model.get_fast_scanner()
    # Prefer the ``data`` attribute when present, otherwise use ``values``.
    genotypes = G.data if hasattr(G, "data") else G.values
    alt_lmls, effsizes = scanner.fast_scan(genotypes, verbose=verbose)

    # Carry over every coordinate whose leading dimension is "candidate".
    candidate_coords = {}
    for key in G.coords.keys():
        coord = G.coords[key]
        if coord.dims[0] == "candidate":
            candidate_coords[key] = ("candidate", coord.values)

    alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=candidate_coords)
    effsizes = DataArray(effsizes, dims=["candidate"], coords=candidate_coords)

    return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)
def test_fast_scanner_set_scale_1covariate():
    """Pin fitted parameters and fast-scan output for a single covariate."""
    n = 10
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M = rng.randn(n, 1)

    lmm = LMM(y, M, QS)
    lmm.fit(verbose=False)

    # Regression values for the fitted null model.
    assert_allclose(lmm.scale, 5.282731934070453)
    assert_allclose(lmm.delta, 0.7029974630034005)
    assert_allclose(lmm.beta, [0.0599712498212])

    # Candidate marker: the covariate plus fresh noise.
    noisy_markers = M.copy() + rng.randn(n, 1)
    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(noisy_markers, verbose=False)

    assert_allclose(result["lml"], [-21.509721], rtol=1e-6)
    assert_allclose(result["effsizes0"], [[-1.43206379971882]])
    assert_allclose(result["effsizes1"], [1.412239], rtol=1e-6)
    assert_allclose(result["scale"], [0.8440354018505616], rtol=1e-6)

    # Scanning an all-zero candidate must reproduce the null covariate effect.
    null_beta = lmm.beta
    zero_scan = scanner.fast_scan(zeros((n, 1)), verbose=False)
    assert_allclose(zero_scan["effsizes0"][0], null_beta)
def test_lmm_scan_fast_scan():
    """Check ``fast_scan`` of the first candidate against a direct optimisation.

    The reference maximises the multivariate-normal log density over the
    three effect sizes (two covariates plus the first candidate column) and
    a log-scale, with the covariance fixed to the fitted ``v0``/``v1``.
    """
    n = 30
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M0 = rng.randn(n, 2)
    M1 = rng.randn(n, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    K = lmm.v0 * X @ X.T + lmm.v1 * eye(n)
    M = concatenate((M0, M1[:, [0]]), axis=1)

    def negative_loglik(params):
        # params = (beta_0, beta_1, beta_candidate, log(scale))
        mean = M @ params[:3]
        covariance = exp(params[3]) * K
        return -st.multivariate_normal(mean, covariance).logpdf(y)

    optimum = minimize(negative_loglik, [0, 0, 0, 0])

    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(M1, verbose=False)

    assert_allclose(result["lml"][0], -optimum.fun)
    assert_allclose(result["effsizes0"][0], optimum.x[:2], rtol=1e-5)
    assert_allclose(result["effsizes1"][0], optimum.x[2:3], rtol=1e-5)
    assert_allclose(result["scale"][0], exp(optimum.x[3]), rtol=1e-5)
def test_fast_scanner_set_scale_multicovariates():
    """Pin fast-scan output when the candidates duplicate three covariates."""
    n = 10
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M = rng.randn(n, 3)

    lmm = LMM(y, M, QS)
    lmm.fit(verbose=False)

    # The candidates are an exact copy of the covariates.
    candidates = M.copy()
    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(candidates, verbose=False)

    expected_lml = [-19.318845, -19.318845, -19.318845]
    assert_allclose(result["lml"], expected_lml, rtol=1e-6, atol=1e-6)

    expected_effsizes0_last = [
        -0.6923007382350215,
        2.3550810825973034,
        -0.38157769653894497,
    ]
    assert_allclose(result["effsizes0"][2], expected_effsizes0_last, rtol=1e-5)

    expected_effsizes1 = [-0.34615, 1.177541, -0.381578]
    assert_allclose(result["effsizes1"], expected_effsizes1, rtol=1e-6, atol=1e-6)
    assert_allclose(result["scale"], [1.0, 1.0, 1.0])
def test_fast_scanner_set_scale_1covariate_redundant():
    """Pin fast-scan output when the candidate equals the only covariate."""
    n = 10
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M = rng.randn(n, 1)

    lmm = LMM(y, M, QS)
    lmm.fit(verbose=False)

    # The candidate is an exact copy of the covariate.
    candidate = M.copy()
    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(candidate, verbose=False)

    assert_allclose(result["lml"][0], -22.357525517597185, rtol=1e-6)
    assert_allclose(result["effsizes0"], [[0.029985622694805182]])
    assert_allclose(
        result["effsizes1"][0], 0.02998562491058301, rtol=1e-6, atol=1e-6
    )
    assert_allclose(result["scale"], [1.0], rtol=1e-6)
def test_lmm_scan():
    """Check ``scan`` of a two-column candidate block and the null accessors.

    The alternative fit is compared against a direct optimisation of the
    multivariate-normal log density; the standard errors against
    ``inv(M' K^-1 M)``; the null-model accessors against pinned values and
    against the fitted ``LMM`` itself.
    """
    n = 30
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M0 = rng.randn(n, 2)
    M1 = rng.randn(n, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    K = lmm.v0 * X @ X.T + lmm.v1 * eye(n)
    M = concatenate((M0, M1), axis=1)

    def negative_loglik(params):
        # params = (four effect sizes, log(scale))
        mean = M @ params[:4]
        covariance = exp(params[4]) * K
        return -st.multivariate_normal(mean, covariance).logpdf(y)

    optimum = minimize(negative_loglik, [0, 0, 0, 0, 0])

    scanner = lmm.get_fast_scanner()
    result = scanner.scan(M1)

    assert_allclose(result["lml"], -optimum.fun)
    assert_allclose(result["effsizes0"], optimum.x[:2], rtol=1e-5)
    assert_allclose(result["effsizes1"], optimum.x[2:4], rtol=1e-5)
    assert_allclose(result["scale"], exp(optimum.x[4]), rtol=1e-5)

    # Standard errors from the scaled covariance of the fitted model.
    K = result["scale"] * lmm.covariance()
    M = concatenate((M0, M1), axis=1)
    expected_se = sqrt(inv(M.T @ solve(K, M)).diagonal())
    reported_se = concatenate((result["effsizes0_se"], result["effsizes1_se"]))
    assert_allclose(expected_se, reported_se)

    # Null-model accessors.
    assert_allclose(scanner.null_lml(), -53.805721275578456, rtol=1e-5)
    assert_allclose(
        scanner.null_beta, [0.26521964226797085, 0.4334778669761928], rtol=1e-5
    )
    expected_null_cov = [
        [0.06302553593799207, 0.00429640179038484],
        [0.004296401790384839, 0.05591392416235412],
    ]
    assert_allclose(scanner.null_beta_covariance, expected_null_cov, rtol=1e-5)
    assert_allclose(scanner.null_scale, 1.0)
    assert_allclose(scanner.null_beta, lmm.beta, rtol=1e-5)
    assert_allclose(scanner.null_beta_covariance, lmm.beta_covariance, rtol=1e-5)
def _lmm(y, M, QS, verbose):
    """Fit an ``LMM`` (``restricted=False``) and return a wrapped scanner.

    Returns ``(scanner, v0, v1)`` where ``scanner`` is the fast scanner of the
    fitted model wrapped in ``ScannerWrapper``, ``v1`` is the fitted model's
    ``v1`` attribute, and ``v0`` is its ``v0`` attribute, or ``None`` when
    ``QS`` is ``None``.
    """
    from glimix_core.lmm import LMM

    model = LMM(y, M, QS, restricted=False)
    model.fit(verbose=verbose)
    sys.stdout.flush()

    # Without a covariance decomposition v0 is reported as None.
    v0 = None if QS is None else model.v0
    v1 = model.v1

    return ScannerWrapper(model.get_fast_scanner()), v0, v1
def _st_lmm(Y, M, QS, verbose):
    """Fit an ``LMM`` (``restricted=False``) and return its fast scanner.

    Returns ``(scanner, v0, v1)`` where ``v1`` is the fitted model's ``v1``
    attribute and ``v0`` is its ``v0`` attribute, or ``nan`` when ``QS`` is
    ``None``.
    """
    from numpy import nan

    from glimix_core.lmm import LMM

    model = LMM(Y, M, QS, restricted=False)
    model.fit(verbose=verbose)
    sys.stdout.flush()

    # Without a covariance decomposition v0 is reported as NaN.
    v0 = nan if QS is None else model.v0
    v1 = model.v1

    scanner = model.get_fast_scanner()
    return scanner, v0, v1
def test_lmm_scan_lmm_iid_prior():
    """Pin fast-scan log marginal likelihoods when no ``QS`` is supplied."""
    n = 30
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    # Drawn before the outcome so the random stream matches the pinned values.
    candidates = rng.randn(n, 2)
    y = _outcome_sample(rng, 1.0, X)

    intercept = ones((n, 1))
    lmm = LMM(y, intercept, None)
    lmm.fit(verbose=False)

    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(candidates, verbose=False)
    lmls = result["lml"]

    assert_allclose(lmls[:2], [-63.16019973550036, -62.489358539276715])
def test_fast_scanner_redundant_candidates():
    """Smoke test: scanning all-ones candidates over all-ones covariates runs.

    There are no assertions; the test only requires that fitting and
    ``fast_scan`` complete without raising.
    """
    n = 10
    rng = RandomState(9458)
    X = _covariates_sample(rng, n, n + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)

    # Five identical constant columns.
    M = ones((n, 5))
    lmm = LMM(y, M, QS, restricted=False)
    lmm.fit(verbose=False)

    candidates = M.copy()
    lmm.get_fast_scanner().fast_scan(candidates, verbose=False)
def run_QTL_analysis(pheno_filename, anno_filename, geno_prefix, plinkGenotype, output_dir, window_size=250000, min_maf=0.05, min_hwe_P=0.001, min_call_rate=0.95, blocksize=1000,
                     cis_mode=True, skipAutosomeFiltering=False, gaussianize_method=None, minimum_test_samples=10, seed=np.random.randint(40000), n_perm=0, write_permutations=False, relatedness_score=0.95, feature_variant_covariate_filename=None, snps_filename=None, feature_filename=None, snp_feature_filename=None, genetic_range='all',
                     covariates_filename=None, kinship_filename=None, sample_mapping_filename=None, extended_anno_filename=None, regressCovariatesUpfront=False):
    '''Core function to take input and run QTL tests on a given chromosome.

    For every feature returned by the loader the function selects the SNPs to
    test, fits a null ``LMM`` on the phenotype and covariates, scans the SNPs
    block by block with the fast scanner, optionally runs permutations, and
    streams the results to an HDF5 writer.  SNP and feature metadata are
    written as tab-separated files in ``output_dir`` at the end.

    Parameters handled directly in this function:

    - ``plinkGenotype``: truthy -> genotypes are read from ``bed``; otherwise
      from ``bgen`` (dosages are then used for testing).
    - ``output_dir``: directory receiving all output files.
    - ``window_size``, ``cis_mode``, ``skipAutosomeFiltering``: forwarded to
      ``utils.do_snp_selection``.
    - ``min_call_rate``, ``min_maf``, ``min_hwe_P``: forwarded to ``do_snp_qc``.
    - ``blocksize``: number of SNPs per scanned chunk.
    - ``gaussianize_method``: when not ``None`` the phenotype is passed through
      ``utils.force_normal_distribution`` with this method.
    - ``minimum_test_samples``: features with fewer samples are skipped.
    - ``seed``: seed for ``np.random.seed`` at the start of each chunk.
      NOTE: the default is evaluated once, when the function is defined.
    - ``n_perm``, ``write_permutations``: number of permutations and whether
      the per-permutation p-values are written out.
    - ``relatedness_score``: cast to ``float`` when given; used for selecting
      genetically unique individuals and kinship-preserving permutations.
    - ``regressCovariatesUpfront``: regress all covariates but the first out
      of the phenotype before scanning.

    All filename arguments and ``genetic_range`` are forwarded to
    ``utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping``.

    The function returns ``None`` and calls ``sys.exit()`` when there is
    nothing to test, on sample-mapping problems, and after removing the result
    file when no feature could be tested.
    '''
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0, copy=False)
    print('Running QTL analysis.')
    minimumProbabilityStep = 0.1
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df] = \
        utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering=skipAutosomeFiltering,
                                                                                          minimum_test_samples=minimum_test_samples, relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                                                                                          covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    # ``is None`` instead of ``== None``: an array-like feature list would make
    # the equality test element-wise and its truth value ambiguous.
    if (feature_list is None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    # The result path is computed once so that the clean-up at the end removes
    # exactly the file that was created here.
    if selectionStart is not None:
        results_path = output_dir + '/qtl_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd)
    else:
        results_path = output_dir + '/qtl_results_{}.h5'.format(chromosome)
    output_writer = qtl_output.hdf5_writer(results_path)
    if (write_permutations):
        if selectionStart is not None:
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir + '/perm_results_{}_{}_{}.h5'.format(chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id + " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            # Nothing to restore here: the set of unique individuals has not
            # been swapped yet for this feature, and ``tmp_unique_individuals``
            # is unbound on the first feature and after every ``del`` below.
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df, bim, cis_mode, window_size, skipAutosomeFiltering)
        snp_cov_df = None
        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[bim['snp'].map(lambda x: x in list(covariateSnp)), :]
                    if (plinkGenotype):
                        snp_cov_df = pd.DataFrame(data=bed[snpQuery_cov['i'].values, :].compute().transpose(), index=fam.index, columns=snpQuery_cov['snp'],)
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print('Warning, during the regression of SNPs we assume ploidy 2.')
                        # NOTE(review): the table built below is never assigned
                        # to ``snp_cov_df``, so bgen covariate SNPs are not
                        # added to the covariate matrix - confirm whether
                        # ``snp_cov_df = snp_cov_df_t`` was intended.
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        for snpId in snpQuery_cov['i']:
                            geno = bgen["genotype"][snpId].compute()
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(1).astype(float)
                                snp_df_dosage_t[(np.amax(geno["probs"][:, :2], 1) + np.amax(geno["probs"][:, 2:4], 1)) < (1 + minimumProbabilityStep)] = float('NaN')
                            else:
                                snp_df_dosage_t = (geno["probs"][:, 0] * 2) + geno["probs"][:, 1]
                                snp_df_dosage_t[np.amax(geno["probs"][:, :3], 1) < ((1 / 3) + minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t, index=fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                        snp_cov_df_t = snp_cov_df_t.transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(np.unique(snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature'] == feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: " + feature_id + " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue

        phenotype_ds = phenotype_df.loc[feature_id]
        contains_missing_samples = any(~np.isfinite(phenotype_ds))
        if (contains_missing_samples):
            print('Feature: ' + feature_id + ' contains missing data.')
            phenotype_ds.dropna(inplace=True)
            na_containing_features = na_containing_features + 1
        # Select indices for relevant individuals in the genotype matrix.
        # These are not unique. NOT to be used to access phenotype/covariates data.
        individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values
        sample2individual_feature = sample2individual_df.loc[phenotype_ds.index]

        if (contains_missing_samples):
            # Remember the global set; it is restored once this feature is done.
            tmp_unique_individuals = geneticaly_unique_individuals
            if (kinship_df is not None) and (relatedness_score is not None):
                geneticaly_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids, individual_ids], relatedness_score)
            else:
                geneticaly_unique_individuals = individual_ids
        else:
            #If no missing samples we can use the previous SNP Qc information before actually loading data.
            #This allows for more efficient blocking and retrieving of data
            # Built once per feature instead of once per SNP.
            known_failed_snps = set(map(str, fail_qc_snps_all))
            snpQuery = snpQuery.loc[snpQuery['snp'].map(lambda x: x not in known_failed_snps)]

        if phenotype_ds.empty or len(geneticaly_unique_individuals) < minimum_test_samples:
            print("Feature: " + feature_id + " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            if contains_missing_samples:
                geneticaly_unique_individuals = tmp_unique_individuals
            continue
        elif np.var(phenotype_ds.values) == 0:
            print("Feature: " + feature_id + " has no variance in selected individuals.")
            fail_qc_features.append(feature_id)
            if contains_missing_samples:
                geneticaly_unique_individuals = tmp_unique_individuals
            continue

        print('For feature: ' + str(currentFeatureNumber) + '/' + str(len(feature_list)) + ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.')

        if (n_perm != 0):
            # ``float`` is what the removed ``np.float`` alias referred to.
            bestPermutationPval = np.ones((n_perm), dtype=float)

        #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
        #test if the covariates, kinship, snp and phenotype are in the same order
        kinship_in_order = (all(kinship_df.loc[individual_ids, individual_ids].index == sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True)
        covariates_in_order = (all(phenotype_ds.index == covariate_df.loc[sample2individual_feature['sample'], :].index) if covariate_df is not None else True)
        if kinship_in_order and covariates_in_order:
            # All lines are in order: put the genotype and phenotype in arrays.
            if kinship_df is not None:
                kinship_mat = kinship_df.loc[individual_ids, individual_ids].values
                kinship_mat = kinship_mat.astype(float)
                ##GOWER normalization of Kinship matrix.
                kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
                ## This needs to go with the subselection stuff.
                if (QS is None and not contains_missing_samples):
                    QS = economic_qs(kinship_mat)
                elif (contains_missing_samples):
                    # Feature-specific decomposition; the shared one is restored later.
                    QS_tmp = QS
                    QS = economic_qs(kinship_mat)
            else:
                K = np.eye(len(phenotype_ds.index))
                if (QS is None and not contains_missing_samples):
                    QS = economic_qs(K)
                elif (contains_missing_samples):
                    QS_tmp = QS
                    QS = economic_qs(K)
            cov_matrix = covariate_df.loc[sample2individual_feature['sample'], :].values if covariate_df is not None else None
            if covariate_df is None:
                cov_matrix = np.ones((len(individual_ids), 1))
            if snp_cov_df is not None:
                snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                snp_cov_df_tmp.index = sample2individual_feature['sample']
                snp_cov_df = pd.DataFrame(fill_NaN.fit_transform(snp_cov_df_tmp))
                snp_cov_df.index = snp_cov_df_tmp.index
                snp_cov_df.columns = snp_cov_df_tmp.columns
                cov_matrix = np.concatenate((cov_matrix, snp_cov_df.values), 1)
                snp_cov_df_tmp = None
                snp_cov_df = None
            cov_matrix = cov_matrix.astype(float)
        else:
            print('There is an issue in mapping phenotypes vs covariates and/or kinship')
            sys.exit()

        phenotype = utils.force_normal_distribution(phenotype_ds.values, method=gaussianize_method) if gaussianize_method is not None else phenotype_ds.values

        #Prepare LMM
        phenotype = phenotype.astype(float)

        ##Mixed and test.
        ##This is a future change so we don't need to decompose the COVs every time.
        ##Like QS this needs to happen when genetic unique individuals is the same.
        #These steps need to happen only once per phenotype.
        lmm = LMM(phenotype, cov_matrix, QS)
        if not mixed:
            lmm.delta = 1
            lmm.fix('delta')
        #Prepare null model.
        lmm.fit(verbose=False)
        if regressCovariatesUpfront:
            phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
            cov_matrix_corrected = cov_matrix[:, 0]
            lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
            lmm.fit(verbose=False)

        null_lml = lmm.lml()
        flmm = lmm.get_fast_scanner()
        for snpGroup in utils.chunker(snpQuery, blocksize):
            #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
            np.random.seed(seed)
            snp_idxs = snpGroup['i'].values
            snp_names = snpGroup['snp'].values

            tested_snp_ids.extend(snp_names)
            #subset genotype matrix, we cannot subselect at the same time, do in two steps.
            if (plinkGenotype):
                snp_df = pd.DataFrame(data=bed[snp_idxs, :].compute().transpose(), index=fam.index, columns=snp_names)
            else:
                snp_df_dosage = pd.DataFrame(np.nan, index=fam.index, columns=snp_names)
                snp_df = pd.DataFrame(np.nan, index=fam.index, columns=snp_names)
                for rowNumber, snpId in enumerate(snp_idxs):
                    geno = bgen["genotype"][snpId].compute()
                    # Only diploid SNPs are decoded; others stay all-NaN.
                    # ``and`` is required here: ``&`` binds tighter than the
                    # comparisons and turned this into ``min > (1 & max) < 3``.
                    if (geno["ploidy"].min() > 1 and geno["ploidy"].max() < 3):
                        if (geno["phased"]):
                            snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(1).astype(float)
                            snp_df_t = (np.abs(np.argmax(geno["probs"][:, :2], axis=1) - 1) + np.abs(np.argmax(geno["probs"][:, 2:4], axis=1) - 1)).astype(float)
                            naId = (np.amax(geno["probs"][:, :2], 1) + np.amax(geno["probs"][:, 2:4], 1)) < (1 + minimumProbabilityStep)
                            snp_df_dosage_t[naId] = float('NaN')
                            snp_df_t[naId] = float('NaN')
                        else:
                            snp_df_dosage_t = ((geno["probs"][:, 0] * 2) + geno["probs"][:, 1]).astype(float)
                            snp_df_t = (np.abs(np.argmax(geno["probs"][:, :3], axis=1) - 2)).astype(float)
                            naId = np.amax(geno["probs"][:, :3], 1) < ((1 / 3) + minimumProbabilityStep)
                            snp_df_dosage_t[naId] = float('NaN')
                            snp_df_t[naId] = float('NaN')
                        snp_df_dosage.loc[:, snp_names[rowNumber]] = snp_df_dosage_t
                        snp_df.loc[:, snp_names[rowNumber]] = snp_df_t
                snp_df_dosage = snp_df_dosage.loc[individual_ids, :]

            snp_df = snp_df.loc[individual_ids, :]

            # Keep only SNP names that occur exactly once in this chunk.
            snp_df = snp_df.loc[:, np.unique(snp_df.columns)[np.unique(snp_df.columns, return_counts=1)[1] == 1]]
            #SNP QC.
            if not contains_missing_samples:
                #remove SNPs from snp_df if they have previously failed QC
                snp_df = snp_df.loc[:, snp_df.columns[~snp_df.columns.isin(fail_qc_snps_all)]]
                if snp_df.shape[1] == 0:
                    continue
                snps_to_test_df = snp_df.loc[:, snp_df.columns[~snp_df.columns.isin(pass_qc_snps_all)]]
                if snps_to_test_df.shape[1] > 0:
                    #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                    if kinship_df is not None:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snps_to_test_df.iloc[np.unique(snps_to_test_df.index, return_index=1)[1]].loc[geneticaly_unique_individuals, :], min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snps_to_test_df, min_call_rate, min_maf, min_hwe_P)
                    snps_to_test_df = None
                    #append snp_names and failed_snp_names
                    pass_qc_snps_all.extend(passed_snp_names)
                    fail_qc_snps_all.extend(failed_snp_names)
                snp_df = snp_df.loc[:, snp_df.columns[snp_df.columns.isin(pass_qc_snps_all)]]
            else:
                #Do snp QC for relevant section.
                #Get relevant slice from: phenotype_ds
                if kinship_df is not None:
                    passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snp_df.iloc[np.unique(snp_df.index, return_index=1)[1]].loc[geneticaly_unique_individuals, :], min_call_rate, min_maf, min_hwe_P)
                else:
                    passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(snp_df, min_call_rate, min_maf, min_hwe_P)
                snp_df = snp_df.loc[:, snp_df.columns[snp_df.columns.isin(passed_snp_names)]]
            snpQcInfo_t = None
            if call_rate is not None:
                snpQcInfo_t = call_rate
                if maf is not None:
                    snpQcInfo_t = pd.concat([snpQcInfo_t, maf.reindex(snpQcInfo_t.index)], axis=1)
                    if hweP is not None:
                        snpQcInfo_t = pd.concat([snpQcInfo_t, hweP.reindex(snpQcInfo_t.index)], axis=1)
            call_rate = None
            maf = None
            hweP = None
            if snpQcInfo is None and snpQcInfo_t is not None:
                snpQcInfo = snpQcInfo_t
            elif snpQcInfo_t is not None:
                snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t], axis=0, sort=False)
            ##First process SNPQc than check if we can continue.
            if len(snp_df.columns) == 0:
                continue
            elif (not plinkGenotype):
                snp_df_dosage = snp_df_dosage.loc[:, np.unique(snp_df.columns)]
            #We could make use of relatedness when imputing. And impute only based on genetically unique individuals.
            snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df), index=snp_df.index, columns=snp_df.columns)
            if (not plinkGenotype):
                snp_df_dosage = pd.DataFrame(fill_NaN.fit_transform(snp_df_dosage), index=snp_df_dosage.index, columns=snp_df_dosage.columns)
            ##No more snp_matrix_DF > snp_df
            # test if the covariates, kinship, snp and phenotype are in the same order
            if (len(snp_df.index) != len(sample2individual_feature.loc[phenotype_ds.index]['iid']) or not all(snp_df.index == sample2individual_feature.loc[phenotype_ds.index]['iid'])):
                print('There is an issue in mapping phenotypes and genotypes')
                sys.exit()
            G = snp_df.values
            if (not plinkGenotype):
                G = snp_df_dosage.values
            G = G.astype(float)
            G_index = snp_df.columns

            alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
            var_pvalues = lrt_pvalues(null_lml, alt_lmls)
            var_effsizes_se = effsizes_se(effsizes, var_pvalues)

            #add these results to qtl_results
            temp_df = pd.DataFrame(index=range(len(G_index)), columns=['feature_id', 'snp_id', 'p_value', 'beta', 'beta_se', 'empirical_feature_p_value'])
            temp_df['snp_id'] = G_index
            temp_df['feature_id'] = feature_id
            temp_df['beta'] = np.asarray(effsizes)
            temp_df['p_value'] = np.asarray(var_pvalues)
            temp_df['beta_se'] = np.asarray(var_effsizes_se)
            #insert default dummy value
            temp_df['empirical_feature_p_value'] = -1.0

            if (n_perm != 0):
                pValueBuffer = []
                totalSnpsToBeTested = (G.shape[1] * n_perm)
                permutationStepSize = np.floor(n_perm / (totalSnpsToBeTested / blocksize))
                if (permutationStepSize > n_perm):
                    permutationStepSize = n_perm
                elif (permutationStepSize < 1):
                    permutationStepSize = 1

                if (write_permutations):
                    perm_df = pd.DataFrame(index=range(len(G_index)), columns=['snp_id'] + ['permutation_' + str(x) for x in range(n_perm)])
                    perm_df['snp_id'] = G_index
                for currentNperm in utils.chunker(list(range(1, n_perm + 1)), permutationStepSize):
                    if (kinship_df is not None) and (relatedness_score is not None):
                        if (plinkGenotype):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df, kinship_df.loc[individual_ids, individual_ids], len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df_dosage, kinship_df.loc[individual_ids, individual_ids], len(currentNperm))
                    else:
                        if (plinkGenotype):
                            temp = utils.get_shuffeld_genotypes(snp_df, len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(snp_df_dosage, len(currentNperm))
                    temp = temp.astype(float)
                    alt_lmls_p, effsizes_p = flmm.fast_scan(temp, verbose=False)
                    var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                    pValueBuffer.extend(np.asarray(var_pvalues_p))
                if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                    print('Error in blocking logic for permutations.')
                    sys.exit()
                # Each slice of G.shape[1] p-values belongs to one permutation;
                # track the smallest p-value seen per permutation.
                perm = 0
                for relevantOutput in utils.chunker(pValueBuffer, G.shape[1]):
                    if (write_permutations):
                        perm_df['permutation_' + str(perm)] = relevantOutput
                    if (bestPermutationPval[perm] > min(relevantOutput)):
                        bestPermutationPval[perm] = min(relevantOutput)
                    perm = perm + 1

            if not temp_df.empty:
                data_written = True
                output_writer.add_result_df(temp_df)
                if (write_permutations):
                    permutation_writer.add_permutation_results_df(perm_df, feature_id)
        #This we need to change in the written file.
        if (n_perm > 1 and data_written):
            alpha_para, beta_para = output_writer.apply_pval_correction(feature_id, bestPermutationPval, cis_mode)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            # Restore the state shared by features without missing samples.
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            del QS_tmp
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(output_dir + '/snp_qc_metrics_naContaining_feature_{}.txt'.format(feature_id), sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0, sort=False)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)

    output_writer.close()
    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        # Use the path the writer was opened with; rebuilding it without the
        # '/' separator pointed at a file that does not exist.
        print(results_path)
        os.remove(results_path)
        sys.exit()
    #gather unique indexes of tested SNPs
    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None:
        # De-duplicate on (index, values) by temporarily exposing the index.
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat([snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if (snp_df.shape[1] == 5):
            snp_df.columns = ['snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate']
        elif (snp_df.shape[1] == 6):
            snp_df.columns = ['snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate', 'maf']
        else:
            snp_df.columns = ['snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate', 'maf', 'hwe_p']

    # Keep the tested features in loop order: n_samples, n_e_samples and the
    # alpha/beta parameters were appended in that order and are assigned
    # positionally below, so a set difference would misalign them.
    failed_feature_set = set(fail_qc_features)
    feature_list = [feature for feature in feature_list if feature not in failed_feature_set]
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if selectionStart is not None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep='\t', index=False)
        annotation_df.to_csv(output_dir + '/feature_metadata_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome), sep='\t', index=False)
        annotation_df.to_csv(output_dir + '/feature_metadata_{}.txt'.format(chromosome), sep='\t')
def test_fast_scanner_statsmodel_gls():
    """Compare null effect-size standard errors with precomputed GLS values.

    This variant does not import statsmodels: the Longley data and every
    statsmodels-derived constant (``rho``, the GLS ``scale`` and ``bse``) are
    hard-coded.
    """
    from numpy.linalg import lstsq

    def _solve(lhs, rhs):
        # Least-squares solve; only the solution array is of interest.
        solution = lstsq(lhs, rhs, rcond=None)
        return solution[0]

    # Precomputed as ``sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1]))
    # .fit().params[1]`` where ``ols_resid`` are the OLS residuals of the
    # Longley regression (16 observations).
    rho = -0.3634294908774683
    lag_matrix = toeplitz(range(16))
    sigma = rho**lag_matrix
    QS = economic_qs(sigma)

    endog = array(
        [
            60323.0, 61122.0, 60171.0, 61187.0,
            63221.0, 63639.0, 64989.0, 63761.0,
            66019.0, 67857.0, 68169.0, 66513.0,
            68655.0, 69564.0, 69331.0, 70551.0,
        ]
    )
    # One row per observation; the first column is the constant term.
    exog = array(
        [
            [1.0, 83.0, 234289.0, 2356.0, 1590.0, 107608.0, 1947.0],
            [1.0, 88.5, 259426.0, 2325.0, 1456.0, 108632.0, 1948.0],
            [1.0, 88.2, 258054.0, 3682.0, 1616.0, 109773.0, 1949.0],
            [1.0, 89.5, 284599.0, 3351.0, 1650.0, 110929.0, 1950.0],
            [1.0, 96.2, 328975.0, 2099.0, 3099.0, 112075.0, 1951.0],
            [1.0, 98.1, 346999.0, 1932.0, 3594.0, 113270.0, 1952.0],
            [1.0, 99.0, 365385.0, 1870.0, 3547.0, 115094.0, 1953.0],
            [1.0, 100.0, 363112.0, 3578.0, 3350.0, 116219.0, 1954.0],
            [1.0, 101.2, 397469.0, 2904.0, 3048.0, 117388.0, 1955.0],
            [1.0, 104.6, 419180.0, 2822.0, 2857.0, 118734.0, 1956.0],
            [1.0, 108.4, 442769.0, 2936.0, 2798.0, 120445.0, 1957.0],
            [1.0, 110.8, 444546.0, 4681.0, 2637.0, 121950.0, 1958.0],
            [1.0, 112.6, 482704.0, 3813.0, 2552.0, 123366.0, 1959.0],
            [1.0, 114.2, 502601.0, 3931.0, 2514.0, 125368.0, 1960.0],
            [1.0, 115.7, 518173.0, 4806.0, 2572.0, 127852.0, 1961.0],
            [1.0, 116.9, 554894.0, 4007.0, 2827.0, 130081.0, 1962.0],
        ]
    )

    # --- raw data: compare against a direct least-squares computation ---
    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)
    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    weighted_exog = _solve(lmm.covariance(), exog)
    reference_cov = _solve(exog.T @ weighted_exog, eye(7))
    reference_se = sqrt(reference_cov.diagonal())
    assert_allclose(scanner.null_beta_se, reference_se, atol=1e-4)

    # --- standardised data: compare against the stored GLS results ---
    std_endog = endog.copy()
    std_endog -= std_endog.mean(0)
    std_endog /= std_endog.std(0)

    std_exog = exog.copy()
    std_exog -= std_exog.mean(0)
    # The constant column has zero standard deviation; silence the resulting
    # division warnings and restore the column right after.
    with errstate(invalid="ignore", divide="ignore"):
        std_exog /= std_exog.std(0)
    std_exog[:, 0] = 1

    lmm = LMM(std_endog, std_exog, QS)
    lmm.fit(verbose=False)
    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    # Precomputed from ``sm.GLS(endog, exog, sigma=sigma).fit()``:
    # ``.scale`` and ``.bse`` respectively.
    gls_scale = 1.7777777777782937
    gls_se = array(
        [
            0.014636888951505144,
            0.21334653097414055,
            0.7428559936739378,
            0.10174713767252333,
            0.032745906589939845,
            0.3494488802468581,
            0.4644879873404213,
        ]
    )

    scanner_se = sqrt(scanner.null_beta_covariance.diagonal())
    # statsmodels rescales the covariance matrix it is given, so its standard
    # errors are divided by sqrt(scale) before comparing.
    assert_allclose(scanner_se, gls_se / sqrt(gls_scale), rtol=1e-6)
    assert_allclose(scanner.null_beta_se, gls_se / sqrt(gls_scale), rtol=1e-6)
def run_PrsQtl_analysis(pheno_filename, anno_filename, prsFile, output_dir,
                        min_call_rate=0.95, blocksize=1000,
                        skipAutosomeFiltering=False, gaussianize_method=None,
                        minimum_test_samples=10,
                        seed=np.random.randint(40000), n_perm=0,
                        write_permutations=False, relatedness_score=None,
                        feature_variant_covariate_filename=None,
                        snps_filename=None, feature_filename=None,
                        snp_feature_filename=None, genetic_range='all',
                        covariates_filename=None, kinship_filename=None,
                        sample_mapping_filename=None,
                        regressCovariatesUpfront=False):
    '''Core function to take input and run QTL tests on a given chromosome.

    For every feature returned by the loading helper, a null LMM is fitted
    on the phenotype and covariates, every risk score in ``risk_df`` that
    survives filtering is scanned with the fast scanner, and the per-test
    results are appended to an HDF5 result file in ``output_dir``.  After the
    loop, ``snp_metadata_*.txt`` and ``feature_metadata_*.txt`` are written
    next to it.

    Parameters
    ----------
    pheno_filename, anno_filename, prsFile, snps_filename, feature_filename,
    snp_feature_filename, covariates_filename, kinship_filename,
    sample_mapping_filename, feature_variant_covariate_filename,
    skipAutosomeFiltering
        Forwarded unchanged to the ``utils`` loading helper.
    genetic_range
        Forwarded to the loading helper as ``selection``.
    output_dir
        Directory receiving all output files (created if needed).
    min_call_rate
        Risk scores whose call rate is not strictly above this are dropped.
    blocksize
        Number of risk scores tested per chunk.
    gaussianize_method
        If not None, passed to ``utils.force_normal_distribution``.
    minimum_test_samples
        Minimum number of samples/unique individuals needed to test a
        feature (the loading helper may return an updated value).
    seed
        Seed handed to ``np.random.seed`` before every chunk.
        NOTE: the default is evaluated once, when the function is defined,
        so every call in a process that omits ``seed`` shares one value.
    n_perm
        Number of permutations; 0 disables permutation testing.
    write_permutations
        Also write per-permutation p-values to ``perm_results_*.h5``.
    relatedness_score
        Converted to float when given; together with a kinship matrix it
        selects genetically unique individuals and kinship-aware shuffling.
    regressCovariatesUpfront
        Regress all covariates but the first column out of the phenotype
        before scanning.

    The function returns None and calls ``sys.exit()`` when there are no
    features, when sample orderings do not match, when the permutation
    blocking is inconsistent, or when no feature could be tested.
    '''
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    print('Running GRS QT analysis.')
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df,
     annotation_df, snp_filter_df, snp_feature_filter_df,
     geneticaly_unique_individuals, minimum_test_samples, feature_list,
     risk_df, chromosome, selectionStart, selectionEnd,
     feature_variant_covariate_df] = \
        utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
            pheno_filename=pheno_filename,
            anno_filename=anno_filename,
            prsFile=prsFile,
            skipAutosomeFiltering=skipAutosomeFiltering,
            minimum_test_samples=minimum_test_samples,
            relatedness_score=relatedness_score,
            snps_filename=snps_filename,
            feature_filename=feature_filename,
            snp_feature_filename=snp_feature_filename,
            selection=genetic_range,
            covariates_filename=covariates_filename,
            kinship_filename=kinship_filename,
            sample_mapping_filename=sample_mapping_filename,
            feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if feature_list is None or len(feature_list) == 0:
        print('No features to be tested.')
        sys.exit()

    # Every output file shares the same suffix; build it once so the writer
    # and the clean-up code below can never disagree on a path.
    if selectionStart is not None:
        file_tag = '{}_{}_{}'.format(chromosome, selectionStart, selectionEnd)
    else:
        file_tag = '{}'.format(chromosome)
    results_path = output_dir + '/qtl_results_{}.h5'.format(file_tag)

    # Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    output_writer = qtl_output.hdf5_writer(results_path)
    if write_permutations:
        permutation_writer = qtl_output.hdf5_permutations_writer(
            output_dir + '/perm_results_{}.h5'.format(file_tag), n_perm)

    # Arrays to store indices of snps tested and pass and fail QC SNPs for
    # features without missingness.
    tested_snp_names = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            # Nothing was swapped for this feature yet, so there is no
            # per-feature state to restore here.
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = risk_df.index.values
        snp_cov_df = None
        if feature_variant_covariate_df is not None:
            if feature_id in feature_variant_covariate_df['feature'].values:
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if any(i in risk_df.index.values for i in covariateSnp):
                    snp_cov_df = risk_df.loc[risk_df.index.map(
                        lambda x: x in list(covariateSnp)), :].transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            snpQuery = list(
                set(snp_filter_df.index).intersection(set(snpQuery)))

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            snpQuery = list(
                set(np.unique(snp_feature_filter_df['snp_id'].loc[
                    snp_feature_filter_df['feature'] == feature_id])
                    ).intersection(set(snpQuery)))

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if contains_missing_samples:
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            # Select indices for relevant individuals in genotype matrix.
            # These are not unique. NOT to be used to access
            # phenotype/covariates data.
            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if contains_missing_samples:
                # Remember the global set; it is restored once this feature
                # has been handled.
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(len(snpQuery)) +
                  ' risk scores will be tested.\n Please stand by.')
            if n_perm != 0:
                # Builtin float: the np.float alias no longer exists in
                # current NumPy releases.
                bestPermutationPval = np.ones((n_perm), dtype=float)

            # Here we need to start preparing the LMM, can use the fam for
            # sample IDS in SNP matrix.
            # Test if the covariates, kinship, snp and phenotype are in the
            # same order.
            kinship_in_order = (
                all(kinship_df.loc[individual_ids, individual_ids].index ==
                    sample2individual_feature.loc[phenotype_ds.index]['iid'])
                if kinship_df is not None else True)
            covariates_in_order = (
                all(phenotype_ds.index == covariate_df.loc[
                    sample2individual_feature['sample'], :].index)
                if covariate_df is not None else True)
            if kinship_in_order and covariates_in_order:
                # All lines are in order: put the correct genotype and
                # phenotype in arrays.
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    # GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    # This needs to go with the subselection stuff.
                    if QS is None and not contains_missing_samples:
                        QS = economic_qs(kinship_mat)
                    elif contains_missing_samples:
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if QS is None and not contains_missing_samples:
                        QS = economic_qs(K)
                    elif contains_missing_samples:
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = sample2individual_feature['sample']
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            # Prepare LMM
            phenotype = phenotype.astype(float)
            # Mixed and test.
            # This is a future change so we don't need to decompose the COVs
            # every time.  Like QS this needs to happen when genetic unique
            # individuals is the same.
            # These steps need to happen only once per phenotype.
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            # Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            for snpGroup in utils.chunker(snpQuery, blocksize):
                # Fix seed at the start of the first chunker so all
                # permutations are based on the same random first split.
                np.random.seed(seed)

                snp_names = snpGroup

                tested_snp_names.extend(snp_names)
                snp_matrix_DF = risk_df.loc[snp_names,
                                            individual_ids].transpose()
                # GRS var QC: drop all-missing and zero-variance scores.
                snp_matrix_DF = snp_matrix_DF.loc[
                    :, snp_matrix_DF.isna().sum(axis=0) != snp_matrix_DF.shape[0]]
                snp_matrix_DF = snp_matrix_DF.loc[:, (
                    np.nanstd(snp_matrix_DF, axis=0) > 0)]
                # Test if the covariates, kinship, snp and phenotype are in
                # the same order.
                if (len(snp_matrix_DF.index) != len(
                        sample2individual_feature.loc[phenotype_ds.index]['iid'])
                        or not all(
                            snp_matrix_DF.index == sample2individual_feature.loc[
                                phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()
                # Impute missingness
                call_rate = 1 - snp_matrix_DF.isnull().sum() / len(
                    snp_matrix_DF.index)
                if snpQcInfo is None and call_rate is not None:
                    snpQcInfo = call_rate
                elif call_rate is not None:
                    snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0)
                selection = call_rate > min_call_rate
                snp_matrix_DF = snp_matrix_DF.loc[:, list(
                    snp_matrix_DF.columns[selection])]
                if snp_matrix_DF.shape[1] == 0:
                    continue
                snp_matrix_DF = pd.DataFrame(
                    fill_NaN.fit_transform(snp_matrix_DF),
                    index=snp_matrix_DF.index,
                    columns=snp_matrix_DF.columns)

                G = snp_matrix_DF.values
                G = G.astype(float)
                G_index = snp_matrix_DF.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                # add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                # insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if n_perm != 0:
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if permutationStepSize > n_perm:
                        permutationStepSize = n_perm
                    elif permutationStepSize < 1:
                        permutationStepSize = 1

                    if write_permutations:
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                geneticaly_unique_individuals,
                                relatedness_score, snp_matrix_DF,
                                kinship_df.loc[individual_ids, individual_ids],
                                len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(
                                snp_matrix_DF, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if not (len(pValueBuffer) == totalSnpsToBeTested):
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if write_permutations:
                            perm_df['permutation_' + str(perm)] = relevantOutput
                        if bestPermutationPval[perm] > min(relevantOutput):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    # perm_df only exists when permutations were actually
                    # run for this chunk.
                    if write_permutations and n_perm != 0:
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            # This we need to change in the written file.
        if n_perm > 1 and data_written:
            alpha_para, beta_para = output_writer.apply_pval_correction(
                feature_id, bestPermutationPval, False)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            # Restore the state shared by features without missing samples.
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            snpQcInfo = snpQcInfo.to_frame(name="call_rate")
            snpQcInfo.index.name = "snp_id"
            snpQcInfo.to_csv(
                output_dir +
                '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                    feature_id),
                sep='\t')
            del QS_tmp
            del tmp_unique_individuals
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)

    output_writer.close()
    if write_permutations:
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if (len(feature_list) - len(fail_qc_features)) == 0:
        # Safety timer to make sure the file is unlocked.
        time.sleep(15)
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(results_path)
        os.remove(results_path)
        sys.exit()

    # gather unique indexes of tested snps
    # write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = np.unique(tested_snp_names)
    snp_df.index = np.unique(tested_snp_names)
    snp_df['chromosome'] = "NA"
    snp_df['position'] = "NA"
    if snpQcInfoMain is not None:
        snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate")
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

    # Keep the tested features in loop order: the per-feature lists below
    # were appended in that order and are assigned positionally.
    failed_features = set(fail_qc_features)
    feature_list = [
        feature for feature in feature_list
        if feature not in failed_features
    ]
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if n_perm > 1:
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(file_tag),
                  sep='\t',
                  index=False)
    annotation_df.to_csv(output_dir +
                         '/feature_metadata_{}.txt'.format(file_tag),
                         sep='\t')