def plot_phenotype_by_sex(phenotype):
    shared_covars = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')
    pheno_data = np.load(f'{ukb}/traits/phenotypes/{phenotype}.npy')
    with open(f'{ukb}/traits/phenotypes/{phenotype}_unit.txt') as unit_file:
        unit = next(unit_file).strip()
    data = utils.merge_arrays(shared_covars, pheno_data)
    # keep only the phenotype value column and the sex column
    data = data[:, [shared_covars.shape[1], 1]]
    data = data[~np.any(np.isnan(data), axis=1), :]
    if len(np.unique(data[:, 0])) < 2000:
        plot_histogram(
            data,
            f'{phenotype} ({unit})',
            phenotype.capitalize() + ' x Sex distribution',
            f'{ukb}/traits/phenotypes/{phenotype}_distribution_by_sex.png',
            {1: 'male', 2: 'female'})
    else:
        plot_1D_kde(
            data,
            f'{phenotype} ({unit})',
            phenotype.capitalize() + ' x Sex distribution',
            f'{ukb}/traits/phenotypes/{phenotype}_distribution_by_sex.png',
            {1: 'male', 2: 'female'})
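# plot_histogram and plot_1D_kde are defined elsewhere in this repo. As a
# rough illustration only, a minimal matplotlib stand-in for the KDE variant
# might look like the sketch below (the signature just mirrors the call
# above; the real implementations may differ):

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

def plot_1D_kde_sketch(data, xlabel, title, out_fname, group_names):
    # data col 0: phenotype values, col 1: group codes (e.g. 1=male, 2=female)
    fig, ax = plt.subplots()
    for code, name in group_names.items():
        values = data[data[:, 1] == code, 0]
        kde = scipy.stats.gaussian_kde(values)
        grid = np.linspace(values.min(), values.max(), 500)
        ax.plot(grid, kde(grid), label=name)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.legend()
    fig.savefig(out_fname)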
def samples_array_with_indicator(sample_fname):
    '''
    sample_fname - a file with the first line 'ID' followed by
    one sample per line (7 digit number) (no negatives or missings)
    '''
    all_samples = get_all_samples()
    with open(sample_fname) as samples_file:
        samples = np.array(
            [line.strip() for line in samples_file][1:],
            dtype=int
        ).reshape(-1, 1)
    samples_indicator = np.concatenate((samples, samples), axis=1)
    samples_merge = utils.merge_arrays(all_samples, samples_indicator)
    assert samples_merge.shape[1] == 2
    return samples_merge
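# The two-column trick above exists because utils.merge_arrays joins on the
# first column: duplicating the sample IDs yields a second column that equals
# the sample's own ID where it appears in sample_fname and NaN where it does
# not, i.e. a presence indicator aligned to all_samples. A toy illustration
# of that behavior (assuming this left-join-with-NaN semantic, which is how
# merge_arrays is used throughout these scripts):

import numpy as np

def _toy_merge(left, right):
    # left join on column 0, filling NaN for rows absent from `right`
    out = np.full((left.shape[0], left.shape[1] + right.shape[1] - 1), np.nan)
    out[:, :left.shape[1]] = left
    lookup = {row[0]: row[1:] for row in right}
    for i, key in enumerate(left[:, 0]):
        if key in lookup:
            out[i, left.shape[1]:] = lookup[key]
    return out

all_ids = np.array([[1.], [2.], [3.]])
subset = np.array([[2., 2.], [3., 3.]])  # the duplicated-ID indicator
merged = _toy_merge(all_ids, subset)
# merged[:, 1] is [nan, 2., 3.]; ~np.isnan(merged[:, 1]) marks membership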
def main():  # noqa: D103
    parser = argparse.ArgumentParser()
    parser.add_argument('outprefix')
    parser.add_argument('pheno_data')
    parser.add_argument('samples')
    parser.add_argument('phenotype')
    parser.add_argument('ethnicity')
    parser.add_argument('--binary', default=False, action='store_true')
    args = parser.parse_args()

    with open(f'{args.outprefix}_README.txt', 'w') as readme:
        today = datetime.datetime.now().strftime("%Y_%m_%d")
        readme.write(f"Run date: {today}\n")
        readme.write(
            "Subsetting to samples with phenotype that passed sample_qc, "
            f"as denoted by the file: {args.samples}\n")
        readme.flush()

        data = np.load(args.pheno_data)

        with open(args.samples) as sample_file:
            next(sample_file)
            samples = np.array([int(sample.strip()) for sample in sample_file])
        samples = samples.reshape(-1, 1)
        data = utils.merge_arrays(samples, data)

        readme.write(
            "Standardizing covariates (subtracting mean, then dividing by "
            "standard deviation)\n")
        readme.flush()
        covariates = data[:, 2:]
        standardized_covariates = \
            (covariates - covariates.mean(axis=0))/covariates.std(axis=0)

        if not args.binary:
            ranks = rank_phenotypes(readme, data)
            rin_ranks = inverse_normalize_ranks(readme, ranks)
            transformed_data = np.concatenate(
                (samples, rin_ranks.reshape(-1, 1), standardized_covariates),
                axis=1)
        else:
            readme.write(
                "Binary outcome is left untransformed (0=control, 1=case)\n")
            phenotype = data[:, 1:2]
            transformed_data = np.concatenate(
                (samples, phenotype, standardized_covariates),
                axis=1)

        np.save(f'{args.outprefix}.npy', transformed_data)
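# rank_phenotypes and inverse_normalize_ranks are defined elsewhere in the
# repo. The standard rank-based inverse normal (RIN) transform that this
# two-step pattern implements replaces each value with the normal quantile of
# its rank; a minimal sketch, with the exact rank offset being an assumption:

import scipy.stats

def rin_transform_sketch(values):
    # ranks 0..n-1 (ties averaged), then quantiles strictly inside (0, 1)
    ranks = scipy.stats.rankdata(values, method='average') - 1
    quantiles = (ranks + 0.5)/len(values)
    return scipy.stats.norm.ppf(quantiles)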
parser.add_argument('outfname')
args = parser.parse_args()

with open(args.samples_fname) as samples_file:
    samples = np.array(
        [line.strip() for line in samples_file][1:],
        dtype=int
    ).reshape(-1, 1)

imp_snp_samples_filepath = \
    f'{ukb}/array_imputed/ukb46122_imp_chr1_v3_s487283.sample'
with open(imp_snp_samples_filepath) as imp_snp_samples_file:
    imp_snp_samples = np.array(
        [line.split()[0] for line in imp_snp_samples_file][2:],
        dtype=int
    ).reshape(-1, 1)

samples_indicator = np.concatenate((samples, samples), axis=1)
samples_merge = utils.merge_arrays(imp_snp_samples, samples_indicator)
assert samples_merge.shape[1] == 2
sample_idx = ~np.isnan(samples_merge[:, 1])

strs = lfg.load_strs(
    'first_pass', f'{args.chrom}:{args.pos}-{args.pos}', sample_idx,
    details=False)
single_chrom_dosages = next(strs)[0]

summed_dosages = collections.defaultdict(
    lambda: np.zeros(np.sum(sample_idx)))
for len1, dosages1 in single_chrom_dosages.items():
    for len2, dosages2 in single_chrom_dosages.items():
        if len2 < len1:
            continue
parser = argparse.ArgumentParser()
parser.add_argument('phenotype')
parser.add_argument('--conditional')
parser.add_argument('--binary', default=False, choices={'logistic', 'linear'})
args = parser.parse_args()
phenotype = args.phenotype

shared_covars = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')
subset_transformed_phenotype = np.load(
    f'{ukb}/traits/subset_transformed_phenotypes/white_brits/{phenotype}.npy')
# shared covars here aren't necessarily going to have exactly mean 0 and std 1
# because they were standardized before subsetting, but that's okay
data = utils.merge_arrays(subset_transformed_phenotype, shared_covars)
data = np.concatenate((data[:, 0:1], data), axis=1)

col_names = ['FID', 'IID']
if not args.binary:
    col_names.append(f'rin_{phenotype}')
else:
    col_names.append(phenotype)

if args.binary == 'logistic':
    # plink expects and outputs a 1=control, 2=case encoding
    # instead of the 0=control, 1=case encoding we use elsewhere
    # (from: https://www.cog-genomics.org/plink/2.0/input
    # under the `--1` section)
    data[:, 2] += 1
else:
today = datetime.datetime.now().strftime("%Y_%m_%d")
data_fname = (
    f'{ukb}/main_dataset/extracted_data/'
    f'{phenotype}_{args.phenotype_field_id}.txt')
readme.write(f"Run date: {today}\n")
readme.write(f"Loading phenotype {phenotype} from txt file "
             f"{data_fname}\n")
# drop first and last columns, which because of the way the data is extracted
# and then read by numpy are always nans
data = np.genfromtxt(data_fname, skip_header=1, delimiter='\t')[:, 1:-1]

readme.write(f"Subsetting to samples at {args.samples}\n")
# load samples that have passed qc
samples = np.genfromtxt(args.samples, skip_header=1)
# subset to those samples
data = utils.merge_arrays(samples.reshape(-1, 1), data)

# number of samples with this phenotype at any assessment
num_samples = np.sum(np.any(~np.isnan(data[:, 1:]), axis=1))

# drop samples with categorical covars with less than this
# number of samples or fraction of samples
cat_drop_num = 50
cat_drop_frac = 0.001  # 0.1%

covar_datas = []
reverse_covar_hashes = []
for covar in args.categorical_covars:
    covar_name, covar_id = covar.split(',')
    covar_fname = (
        f'{ukb}/main_dataset/extracted_data/{covar_name}_{covar_id}.txt')
    readme.write(
])
assert result.shape[0] == 1

pheno_data = np.load(pheno_datas_d[ethnicity])

bgen_samples = []
with open(f'{ukb}/microarray/ukb46122_hap_chr1_v2_s487314.sample') as samplefile:
    for num, line in enumerate(samplefile):
        if num <= 1:
            # skip first two lines
            continue
        bgen_samples.append(line.split()[0])
assert len(bgen_samples) == 487409
samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
merged_arr = utils.merge_arrays(samples_array, pheno_data)
unfiltered_subset = ~np.isnan(merged_arr[:, 1])
n_samples = np.sum(unfiltered_subset)

vcf = cyvcf2.VCF(args.imputed_vcf)
found_rec = False
for record in vcf(f'{args.chrom}:{args.pos}-{args.pos}'):
    if record.POS < args.pos:
        continue
    if record.INFO.get('PERIOD') is None:
        continue
    assert not found_rec
    found_rec = True
    trrecord = trh.HarmonizeRecord(vcfrecord=record, vcftype='beagle-hipstr')
if not args.zero_one_neg_nan:
    # cols: id, date first reported
    data = load_date_data_field(data_fname)
    # cols: id, is_case
    data[:, 1] = ~np.isnan(data[:, 1])
else:
    # cols: id, is_case
    data = load_0_1_neg_nan_field(data_fname)

readme.write(f"Subsetting to samples at {args.samples}\n")
# load samples that have passed qc
samples = np.genfromtxt(args.samples, skip_header=1)
# TODO test this
# subset to those samples
data = utils.merge_arrays(samples.reshape(-1, 1), data)

year_of_birth = np.genfromtxt(
    f'{ukb}/main_dataset/extracted_data/year_of_birth_34.txt',
    delimiter='\t',
    skip_header=1)[:, 1:-1]
month_of_birth = np.genfromtxt(
    f'{ukb}/main_dataset/extracted_data/month_of_birth_52.txt',
    delimiter='\t',
    skip_header=1)[:, 1:-1]
# cols: id, is case, year of birth, month of birth
data = utils.merge_arrays(utils.merge_arrays(data, year_of_birth),
                          month_of_birth)
missing_birth = np.isnan(data[:, 2]) | np.isnan(data[:, 3])
assert np.sum(missing_birth) < 100
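# load_date_data_field and load_0_1_neg_nan_field are repo helpers not shown
# in this excerpt. Judging from the name and the "cols: id, is_case" comment,
# the latter loads a field coded 0/1 with negative missing-data codes; a
# sketch under that assumption, reusing this script's genfromtxt pattern:

import numpy as np

def load_0_1_neg_nan_field_sketch(fname):
    # drop the first and last always-nan columns, as for other extracted fields
    data = np.genfromtxt(fname, delimiter='\t', skip_header=1)[:, 1:-1]
    # negative values are missing-data codes
    data[data[:, 1] < 0, 1] = np.nan
    return data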
def generate_figure(assoc_results_fname, pheno_data_fname, chrom, pos,
                    phenotype, dosage_fraction_threshold, unit, binary,
                    publication):
    assert bool(unit) or binary
    assert 0 <= dosage_fraction_threshold <= 1

    if not binary:
        y_axis_label = 'Mean ' + phenotype.replace('_', ' ') + f' ({unit})'
    else:
        y_axis_label = 'Fraction ' + phenotype.replace('_', ' ') + ' cases'
    figure = bokeh.plotting.figure(
        width=600,
        height=600,
        y_axis_label=y_axis_label,
        x_axis_label='Sum of allele lengths (repeat copies)')
    figure.grid.grid_line_color = None
    figure.background_fill_color = None
    figure.border_fill_color = None
    figure.toolbar_location = None
    figure.title.text_font_size = '18px'
    figure.axis.axis_label_text_font_size = '18px'
    figure.axis.major_label_text_font_size = '14px'

    if not binary:
        stat_name = 'mean'
    else:
        stat_name = 'fraction'

    def fix_header(header):
        def fix_header_helper(_):
            part1 = header.rpartition('0.05_significance_CI')
            fix1 = part1[0] + 'foo' + part1[2]
            part2 = fix1.rpartition('5e-8_significance_CI')
            fix2 = part2[0] + 'bar' + part2[2]
            return fix2.split('\t')
        return fix_header_helper

    with open(assoc_results_fname) as tsv:
        header = tsv.readline().strip()
    result = pl.scan_csv(
        assoc_results_fname,
        sep='\t',
        dtypes={'locus_filtered': str},
        skip_rows=1,
        has_header=False,
        with_column_names=fix_header(header)
    ).filter(
        (pl.col('chrom') == chrom) & (pl.col('pos') == pos)
    ).collect().select([  # have to collect first due to some sort of bug
        'motif',
        '0.05_significance_CI',
        '5e-8_significance_CI',
        f'{stat_name}_{phenotype}_per_single_dosage',
        'total_subset_dosage_per_summed_gt'
    ])
    assert result.shape[0] == 1

    pheno_data = np.load(pheno_data_fname)

    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
    merged_arr = utils.merge_arrays(samples_array, pheno_data)
    unfiltered_subset = ~np.isnan(merged_arr[:, 1])
    n_samples = np.sum(unfiltered_subset)

    subset_summed_dosage_fractions = {
        float(allele): val for allele, val in ast.literal_eval(
            result['total_subset_dosage_per_summed_gt'].to_numpy()[0]
        ).items()
    }
    total_dosage = np.sum(list(subset_summed_dosage_fractions.values()))
    subset_summed_dosage_fractions = {
        key: val/total_dosage
        for key, val in subset_summed_dosage_fractions.items()
    }
    alleles = list(subset_summed_dosage_fractions.keys())
    alleles_copy = alleles.copy()
    for allele in alleles_copy:
        if subset_summed_dosage_fractions[allele] < dosage_fraction_threshold:
            alleles.remove(allele)
    alleles = sorted(alleles)

    mean_per_dosage = {
        float(allele): val for allele, val in ast.literal_eval(
            result[f'{stat_name}_{phenotype}_per_single_dosage'].to_numpy()[0]
        ).items()
    }
    ci5e_2 = {
        float(allele): val for allele, val in ast.literal_eval(
            result['0.05_significance_CI'].to_numpy()[0]
        ).items()
    }
    ci5e_8 = {
        float(allele): val for allele, val in ast.literal_eval(
            result['5e-8_significance_CI'].to_numpy()[0]
        ).items()
    }
    y_min = min(ci5e_8[allele][0] for allele in alleles)
    y_max = max(ci5e_8[allele][1] for allele in alleles)

    figure.varea(alleles,
                 [ci5e_2[allele][1] for allele in alleles],
                 [ci5e_8[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.2,
                 legend_label='1 - 5e-8 Confidence Interval')
    figure.varea(alleles,
                 [ci5e_2[allele][0] for allele in alleles],
                 [ci5e_2[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.4,
                 legend_label='0.95 Confidence Interval')
    figure.varea(alleles,
                 [ci5e_8[allele][0] for allele in alleles],
                 [ci5e_2[allele][0] for allele in alleles],
                 color="red",
                 alpha=0.2)
    figure.line(alleles,
                [mean_per_dosage[allele] for allele in alleles],
                line_width=2,
                color="black")
    figure.circle(alleles,
                  [mean_per_dosage[allele] for allele in alleles],
                  color="black",
                  size=6,
                  legend_label='mean')
    figure.legend.label_text_font_size = '10px'
    figure.y_range = bokeh.models.Range1d(y_min - 0.05*(y_max - y_min),
                                          y_max + 0.05*(y_max - y_min))
    figure.add_layout(
        bokeh.models.Title(text=f'STR {chrom}:{pos}',
                           align="center",
                           text_font_size='18px'), "above")
    figure.add_layout(
        bokeh.models.Title(text=phenotype.replace('_', ' ').capitalize() +
                           " vs genotype",
                           align="center",
                           text_font_size='18px'), "above")
    if not publication:
        figure.add_layout(
            bokeh.models.Title(
                text="Phenotype values are unadjusted for covariates",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text="People contribute to each genotype based on their "
                     "prob. of having that genotype",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(text="Only considers tested individuals",
                               align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text=f"Genotypes with dosages less than "
                     f"{100*dosage_fraction_threshold}% of the population "
                     f"are omitted",
                align="center"), "below")
    return figure
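# A usage sketch for generate_figure. All paths, the locus, and the unit
# below are hypothetical placeholders, and exporting bokeh figures to PNG
# additionally requires selenium plus a browser webdriver:

import bokeh.io

fig = generate_figure(
    'my_gwas_results.tab',  # placeholder association results TSV
    'my_phenotype.npy',     # placeholder transformed-phenotype array
    1, 204502514,           # placeholder chrom and pos
    'platelet_count',
    dosage_fraction_threshold=0.001,
    unit='10^9 cells/L',    # placeholder unit string
    binary=False,
    publication=False)
bokeh.io.export_png(fig, filename='locus_plot.png')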
def main():
    # do an association test of the combined dosage of the listed alleles vs
    # the listed phenotypes in each ethnicity
    parser = argparse.ArgumentParser()
    #parser.add_argument('chrom', type=int)
    #parser.add_argument('pos', type=int)
    parser.add_argument('var_file')
    #parser.add_argument('--phenotypes', nargs='+')
    #parser.add_argument('--alleles', type=int, nargs='+')
    # allele indices, i.e. this SNP is present in alleles 0 (ref),
    # 2 (2nd alt) and 5
    args = parser.parse_args()

    print(
        'str\tvariant\tethnicity\tvar frequency\tstr var r2\tphenotype\t'
        'str p-val\tvar p-val\tstr p-val conditioning on var'
    )

    scovs = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')

    variants = pl.read_csv(args.var_file, sep='\t')
    for i in range(variants.shape[0]):
        chrom = variants[i, 'chrom']
        pos = variants[i, 'pos']
        str_ = f'{chrom}:{pos}'
        var = next(cyvcf2.VCF(
            f'{ukb}/str_imputed/runs/first_pass/vcfs/annotated_strs/'
            f'chr{chrom}.vcf.gz'
        )(str_))
        alleles = [int(num) for num in variants[i, 'alleles'].split(',')]
        name = variants[i, 'name']
        phenotypes = variants[i, 'phenos'].split(',')
        for phenotype in phenotypes:
            for ethnicity in ('white_brits', 'black', 'south_asian',
                              'chinese', 'irish', 'white_other'):
                # ethnicity
                total_samp_idx = sample_utils.get_samples_idx_ethnicity(
                    ethnicity)
                total_var_gts = var_dosage_gts(var, total_samp_idx, alleles)
                total_str_gts = str_dosage_gts(var, total_samp_idx)
                var_freq = np.sum(total_var_gts)/(2*total_var_gts.shape[0])
                corr = np.corrcoef(total_var_gts, total_str_gts)
                assert corr.shape == (2, 2)
                str_var_r2 = corr[0, 1]**2

                # ethnicity, pheno
                pcovs = np.load(
                    f'{ukb}/traits/subset_transformed_phenotypes/'
                    f'{ethnicity}/{phenotype}.npy')
                samps = sample_utils.get_ordered_samples_phenotype(
                    ethnicity, phenotype).reshape(-1, 1)
                covs = python_array_utils.merge_arrays(
                    python_array_utils.merge_arrays(samps, pcovs), scovs)
                outcomes = covs[:, 1]
                covs = covs[:, 2:]

                samp_idx = sample_utils.get_samples_idx_phenotype(
                    ethnicity, phenotype)
                var_gts = standardize(var_dosage_gts(var, samp_idx, alleles))
                str_gts = standardize(str_dosage_gts(var, samp_idx))
                str_best_guess_gts = trh.HarmonizeRecord(
                    vcfrecord=var, vcftype='beagle-hipstr'
                ).GetGenotypeIndicies()[samp_idx, :-1]

                str_p = OLS(
                    outcomes,
                    np.hstack((covs, np.ones((covs.shape[0], 1)),
                               str_gts.reshape(-1, 1)))
                ).fit().pvalues[-1]
                if np.all(var_gts == 0) or np.all(var_gts == 2):
                    var_p = 1
                    str_cond_p = str_p
                else:
                    var_p = OLS(
                        outcomes,
                        np.hstack((covs, np.ones((covs.shape[0], 1)),
                                   var_gts.reshape(-1, 1)))
                    ).fit().pvalues[-1]
                    str_cond_p = OLS(
                        outcomes,
                        np.hstack((covs, np.ones((covs.shape[0], 1)),
                                   var_gts.reshape(-1, 1),
                                   str_gts.reshape(-1, 1)))
                    ).fit().pvalues[-1]
                print(
                    f'{str_}\t{name}\t{ethnicity}\t{var_freq:.3g}\t'
                    f'{str_var_r2:.3g}\t{phenotype}\t{str_p:.3g}\t'
                    f'{var_p:.3g}\t{str_cond_p:.3g}'
                )
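# standardize is not shown in this excerpt. Given the all-zeros/all-twos
# check on its output above, it presumably z-scores a dosage vector but
# leaves constant vectors untouched; a minimal sketch under that assumption:

import numpy as np

def standardize_sketch(gts):
    std = np.std(gts)
    if std == 0:
        # leave constant vectors unchanged so the caller's equality checks
        # still see the raw values
        return gts
    return (gts - np.mean(gts))/std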
scovs = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')

print(
    'phenotype\tethnicity\tcompound coeff single test (s.d.)\t'
    'A coeff (s.d.)\tTA coeff (s.d.)\tCA coeff (s.d.)'
)
for phenotype in ('mean_platelet_volume', 'platelet_distribution_width',
                  'platelet_count'):
    for ethnicity in ('white_brits', 'black', 'south_asian', 'chinese',
                      'irish', 'white_other'):
        print(f'{phenotype}\t{ethnicity}', end='')
        pcovs = np.load(
            f'{ukb}/traits/subset_transformed_phenotypes/'
            f'{ethnicity}/{phenotype}.npy')
        samps = sample_utils.get_ordered_samples_phenotype(
            ethnicity, phenotype).reshape(-1, 1)
        covs = python_array_utils.merge_arrays(
            python_array_utils.merge_arrays(samps, pcovs), scovs)
        outcomes = covs[:, 1]
        covs = covs[:, 2:]

        samp_idx = sample_utils.get_samples_idx_phenotype(ethnicity,
                                                          phenotype)
        itr = load_PACSIN2.get_gt_itr(samp_idx)
        next(itr)  # skip details
        compound, a, ta, ca = [next(itr)[0] for _ in range(4)]
        # collapse each length-to-dosage dict into one summed-length dosage
        # vector per sample
        compound, a, ta, ca = [
            np.sum([
                len_*np.sum(dosages, axis=1)
                for len_, dosages in dosage_dict.items()
            ], axis=0).reshape(-1, 1)
            for dosage_dict in (compound, a, ta, ca)
        ]
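# A toy illustration of the collapse above: each per-length phased dosage
# matrix (n samples x 2 haplotypes) contributes length * P(haplotype has that
# length), yielding one expected summed length per sample:

import numpy as np

dosage_dict = {
    2.0: np.array([[1.0, 0.2], [0.0, 1.0]]),  # P(haplotype has length 2)
    3.0: np.array([[0.0, 0.8], [1.0, 0.0]]),  # P(haplotype has length 3)
}
expected_summed_length = np.sum(
    [len_*np.sum(dosages, axis=1) for len_, dosages in dosage_dict.items()],
    axis=0)
# sample 0: 2*(1.0 + 0.2) + 3*(0.0 + 0.8) = 4.8
# sample 1: 2*(0.0 + 1.0) + 3*(1.0 + 0.0) = 5.0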
def perform_regional_gwas_helper(outfile, pheno_and_covars_fname,
                                 shared_covars_fname,
                                 untransformed_phenotypes_fname,
                                 get_genotype_iter, phenotype, binary, region,
                                 runtype, conditional_covars_fname=None):
    outfile.write("chrom\tpos\talleles\tlocus_filtered\t"
                  f"p_{phenotype}\tcoeff_{phenotype}\t")
    if binary != 'logistic':
        outfile.write(f'se_{phenotype}\tR^2\t')
    else:
        outfile.write("unused_col\tunused_col\t")
    outfile.flush()

    n_loci = 0
    batch_time = 0
    batch_size = 50
    total_time = 0

    pheno_specific_covars = np.load(pheno_and_covars_fname)
    shared_covars = np.load(shared_covars_fname)
    covars = utils.merge_arrays(pheno_specific_covars, shared_covars)
    if conditional_covars_fname:
        gt_covars = np.load(conditional_covars_fname)
        covars = utils.merge_arrays(covars, gt_covars)

    # order samples according to order in genetics files
    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
    merge = utils.merge_arrays(samples_array, covars)
    unfiltered_samples = ~np.isnan(merge[:, 1])

    outcome = merge[unfiltered_samples, 1].copy()
    covars = merge[unfiltered_samples, :]
    covars = (covars - np.mean(covars, axis=0))/np.std(covars, axis=0)
    covars[:, 1] = 1  # reuse the column that was the outcome as the intercept

    ori_phenotypes = np.load(untransformed_phenotypes_fname)
    ori_phenotypes = utils.merge_arrays(samples_array, ori_phenotypes)[:, 1]
    ori_phenotypes = ori_phenotypes[unfiltered_samples]

    # first yield is special
    genotype_iter = get_genotype_iter(unfiltered_samples)
    extra_detail_fields = next(genotype_iter)
    outfile.write('\t'.join(extra_detail_fields) + '\t')

    if not binary:
        stat = 'mean'
    else:
        stat = 'fraction'

    outfile.write(f'{stat}_{phenotype}_per_single_dosage\t'
                  '0.05_significance_CI\t'
                  '5e-8_significance_CI')
    if runtype == 'strs':
        outfile.write('\ttotal_subset_dosage_per_summed_gt\t'
                      f'{stat}_{phenotype}_per_paired_dosage\t'
                      '0.05_significance_CI\t'
                      '5e-8_significance_CI')
    outfile.write('\n')
    outfile.flush()

    start_time = time.time()
    for dosage_gts, unique_alleles, chrom, pos, locus_filtered, \
            locus_details in genotype_iter:
        assert len(locus_details) == len(extra_detail_fields)

        covars[:, 0] = np.nan  # reuse the column that was the ids as the genotypes
        n_loci += 1
        allele_names = ','.join(list(unique_alleles.astype(str)))
        outfile.write(f"{chrom}\t{pos}\t{allele_names}\t")
        if locus_filtered:
            outfile.write(f'{locus_filtered}\t1\tnan\tnan\tnan\t')
            outfile.write('\t'.join(locus_details))
            if runtype == 'strs':
                outfile.write('\tnan'*6 + '\n')
            else:
                outfile.write('\tnan'*3 + '\n')
            outfile.flush()
            continue
        else:
            outfile.write('False\t')

        if runtype == 'strs':
            gts = np.sum([
                _len*np.sum(dosages, axis=1)
                for _len, dosages in dosage_gts.items()
            ], axis=0)
        else:
            gts = dosage_gts[:, 1] + 2*dosage_gts[:, 2]
        std = np.std(gts)
        gts = (gts - np.mean(gts))/std
        covars[:, 0] = gts

        if not binary or binary == 'linear':
            # do the regression
            model = OLS(
                outcome,
                covars,
                missing='drop',
            )
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            se = reg_result.bse[0]
            rsquared = reg_result.rsquared
            outfile.write(f"{pval:.2e}\t{coef/std}\t{se/std}\t{rsquared}\t")
        else:
            model = sm.GLM(outcome,
                           covars,
                           missing='drop',
                           family=sm.families.Binomial())
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            outfile.write(f'{pval:.2e}\t{coef/std}\tnan\tnan\t')
        outfile.write('\t'.join(locus_details) + '\t')

        if runtype == 'strs':
            single_dosages = {}
            paired_dosages = {}
            for len1 in unique_alleles:
                for len2 in unique_alleles:
                    if len1 > len2:
                        continue
                    if len1 != len2:
                        dosages = (
                            dosage_gts[len1][:, 0]*dosage_gts[len2][:, 1] +
                            dosage_gts[len1][:, 1]*dosage_gts[len2][:, 0])
                    else:
                        dosages = \
                            dosage_gts[len1][:, 0]*dosage_gts[len1][:, 1]
                    if np.sum(dosages) <= 0:
                        continue
                    summed_len = round(len1 + len2, 2)
                    if summed_len not in single_dosages:
                        single_dosages[summed_len] = dosages
                    else:
                        single_dosages[summed_len] += dosages
                    minlen = min(len1, len2)
                    maxlen = max(len1, len2)
                    paired_dosages[(minlen, maxlen)] = dosages

            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            paired_dosage_stat = {}
            paired_dosage_95_CI = {}
            paired_dosage_GWAS_CI = {}
            if not binary:
                for _len, dosages in single_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    single_dosage_stat[_len] = mean_stats.mean
                    single_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[_len] = \
                        mean_stats.tconfint_mean(5e-8)
                for _len, dosages in paired_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    paired_dosage_stat[_len] = mean_stats.mean
                    paired_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    paired_dosage_GWAS_CI[_len] = \
                        mean_stats.tconfint_mean(5e-8)
            else:
                for _len, dosages in single_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    single_dosage_stat[_len] = p
                    single_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = \
                        weighted_binom_conf.weighted_binom_conf(
                            dosages, ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
                for _len, dosages in paired_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    paired_dosage_stat[_len] = p
                    paired_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = \
                        weighted_binom_conf.weighted_binom_conf(
                            dosages, ori_phenotypes, 5e-8)
                    paired_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) +
                '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) +
                '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(
                    {key: np.sum(arr) for key, arr in single_dosages.items()})
                + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_95_CI) +
                '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_GWAS_CI) +
                '\n')
        else:
            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            if not binary:
                for alt_count in range(3):
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosage_gts[:, alt_count])
                    single_dosage_stat[alt_count] = mean_stats.mean
                    single_dosage_95_CI[alt_count] = \
                        mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[alt_count] = \
                        mean_stats.tconfint_mean(5e-8)
            else:
                for alt_count in range(3):
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosage_gts[:, alt_count], ori_phenotypes, 0.05)
                    single_dosage_stat[alt_count] = p
                    single_dosage_95_CI[alt_count] = (lower, upper)
                    _, lower_gwas, upper_gwas = \
                        weighted_binom_conf.weighted_binom_conf(
                            dosage_gts[:, alt_count], ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[alt_count] = (lower_gwas,
                                                        upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) +
                '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) +
                '\n')
        outfile.flush()

        duration = time.time() - start_time
        total_time += duration
        batch_time += duration
        if n_loci % batch_size == 0:
            print(f"time/locus (last {batch_size}): "
                  f"{batch_time/batch_size}s\n"
                  f"time/locus ({n_loci} total loci): {total_time/n_loci}s\n",
                  flush=True)
            batch_time = 0
        start_time = time.time()

    if n_loci > 0:
        print(f"Done.\nTotal loci: {n_loci}\nTotal time: {total_time}s\n"
              f"time/locus: {total_time/n_loci}s\n",
              flush=True)
    else:
        print(f"No variants found in the region {region}\n", flush=True)
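# weighted_binom_conf.weighted_binom_conf is a repo helper not shown here; per
# its call sites it returns (weighted case fraction, CI lower, CI upper) for a
# given alpha. One standard construction is a normal approximation with Kish's
# effective sample size; this sketch is an assumption about the approach, not
# the repo's exact implementation:

import numpy as np
import scipy.stats

def weighted_binom_conf_sketch(weights, outcomes, alpha):
    # weighted fraction of cases (outcomes coded 0/1)
    p = np.sum(weights*outcomes)/np.sum(weights)
    # Kish's effective sample size accounts for unequal weights
    n_eff = np.sum(weights)**2/np.sum(weights**2)
    z = scipy.stats.norm.ppf(1 - alpha/2)
    half_width = z*np.sqrt(p*(1 - p)/n_eff)
    return p, max(0.0, p - half_width), min(1.0, p + half_width)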