Example #1
def plot_phenotype_by_sex(phenotype):
    shared_covars = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')
    pheno_data = np.load(f'{ukb}/traits/phenotypes/{phenotype}.npy')
    with open(f'{ukb}/traits/phenotypes/{phenotype}_unit.txt') as unit_file:
        unit = next(unit_file).strip()

    data = utils.merge_arrays(shared_covars, pheno_data)
    # keep just the phenotype value (the column appended after the shared
    # covariates) and sex (assumed to be the first shared covariate),
    # then drop samples missing either
    data = data[:, [shared_covars.shape[1], 1]]
    data = data[~np.any(np.isnan(data), axis=1), :]

    # histogram if the phenotype takes few enough distinct values, else a KDE
    if len(np.unique(data[:, 0])) < 2000:
        plot_histogram(
            data, f'{phenotype} ({unit})',
            phenotype.capitalize() + ' x Sex distribution',
            f'{ukb}/traits/phenotypes/{phenotype}_distribution_by_sex.png', {
                1: 'male',
                2: 'female'
            })
    else:
        plot_1D_kde(
            data, f'{phenotype} ({unit})',
            phenotype.capitalize() + ' x Sex distribution',
            f'{ukb}/traits/phenotypes/{phenotype}_distribution_by_sex.png', {
                1: 'male',
                2: 'female'
            })
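utils.merge_arrays is used throughout these examples but never defined in
them. A minimal sketch of its assumed behavior (a left join of two 2D numpy
arrays on their first, ID, column); the real helper may differ:

import numpy as np

def merge_arrays(left, right):
    # assumed behavior: left-join `right` onto `left` by the first (ID)
    # column, preserving `left`'s row order and nan-filling unmatched rows
    lookup = {row[0]: row[1:] for row in right}
    n_extra = right.shape[1] - 1
    merged = np.full((left.shape[0], left.shape[1] + n_extra), np.nan)
    merged[:, :left.shape[1]] = left
    for i, id_ in enumerate(left[:, 0]):
        if id_ in lookup:
            merged[i, left.shape[1]:] = lookup[id_]
    return merged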
Example #2
def samples_array_with_indicator(sample_fname):
    '''
    sample_fname - a file whose first line is 'ID', followed by one sample ID
                   (a 7-digit number) per line; no negative or missing values
    '''
    all_samples = get_all_samples()

    with open(sample_fname) as samples_file:
        samples = np.array([line.strip() for line in samples_file][1:],
                           dtype=int).reshape(-1, 1)

    # duplicate the ID column so that, after the merge, the second column is
    # the sample ID where present and nan where absent (a membership indicator)
    samples_indicator = np.concatenate((samples, samples), axis=1)
    samples_merge = utils.merge_arrays(all_samples, samples_indicator)
    assert samples_merge.shape[1] == 2
    return samples_merge
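A usage sketch for samples_array_with_indicator (the file name here is
hypothetical): after the merge, column 1 holds the sample ID where the sample
appeared in sample_fname and nan elsewhere, so it doubles as a membership
indicator over all samples.

merged = samples_array_with_indicator('qc_samples.txt')  # hypothetical file
in_subset = ~np.isnan(merged[:, 1])  # True where the sample was listed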
def main():  # noqa: D103
    parser = argparse.ArgumentParser()
    parser.add_argument('outprefix')
    parser.add_argument('pheno_data')
    parser.add_argument('samples')
    parser.add_argument('phenotype')
    parser.add_argument('ethnicity')
    parser.add_argument('--binary', default=False, action='store_true')
    args = parser.parse_args()

    with open(f'{args.outprefix}_README.txt', 'w') as readme:
        today = datetime.datetime.now().strftime("%Y_%m_%d")
        readme.write(f"Run date: {today}\n")
        readme.write(
            "Subsetting to samples with phenotype that passed sample_qc, "
            f"as denoted by the file: {args.samples}\n")
        readme.flush()

        data = np.load(args.pheno_data)
        with open(args.samples) as sample_file:
            next(sample_file)
            samples = np.array([int(sample.strip()) for sample in sample_file])
        samples = samples.reshape(-1, 1)
        data = utils.merge_arrays(samples, data)

        readme.write(
            "Standardizing covariates (subtracting mean, then dividing by standard deviation)\n"
        )
        readme.flush()
        # data columns are: id, phenotype, then covariates
        covariates = data[:, 2:]
        standardized_covariates = (
            (covariates - covariates.mean(axis=0)) / covariates.std(axis=0))

        if not args.binary:
            ranks = rank_phenotypes(readme, data)
            rin_ranks = inverse_normalize_ranks(readme, ranks)
            transformed_data = np.concatenate(
                (samples, rin_ranks.reshape(-1, 1), standardized_covariates),
                axis=1)
        else:
            readme.write(
                "Binary outcome is left untransformed (0=control, 1=case)\n")
            phenotype = data[:, 1:2]
            transformed_data = np.concatenate(
                (samples, phenotype, standardized_covariates), axis=1)

        np.save(f'{args.outprefix}.npy', transformed_data)
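rank_phenotypes and inverse_normalize_ranks are not shown in this excerpt. A
sketch of the standard rank-based inverse-normal transform they are assumed
to implement (the repo's helpers may break ties or offset ranks differently):

import scipy.stats

def rank_inverse_normalize(values):
    # rank the values, then map the rank quantiles onto the standard normal
    ranks = scipy.stats.rankdata(values, method='average')
    return scipy.stats.norm.ppf((ranks - 0.5) / len(values))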
parser.add_argument('outfname')
args = parser.parse_args()

with open(args.samples_fname) as samples_file:
    samples = np.array([line.strip() for line in samples_file][1:],
                       dtype=int).reshape(-1, 1)

imp_snp_samples_filepath = f'{ukb}/array_imputed/ukb46122_imp_chr1_v3_s487283.sample'
with open(imp_snp_samples_filepath) as imp_snp_samples_file:
    imp_snp_samples = np.array(
        [line.split()[0] for line in imp_snp_samples_file][2:],
        dtype=int).reshape(-1, 1)

samples_indicator = np.concatenate((samples, samples), axis=1)
samples_merge = utils.merge_arrays(imp_snp_samples, samples_indicator)
assert samples_merge.shape[1] == 2
sample_idx = ~np.isnan(samples_merge[:, 1])

strs = lfg.load_strs('first_pass',
                     f'{args.chrom}:{args.pos}-{args.pos}',
                     sample_idx,
                     details=False)

single_chrom_dosages = next(strs)[0]
summed_dosages = collections.defaultdict(lambda: np.zeros(np.sum(sample_idx)))
for len1, dosages1 in single_chrom_dosages.items():
    for len2, dosages2 in single_chrom_dosages.items():
        if len2 < len1:
            continue
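        # (the excerpt is truncated here; judging by the analogous pairing
        # loop in perform_regional_gwas_helper below, the body presumably
        # combines per-haplotype dosages and accumulates them by summed length)
        if len1 != len2:
            pair_dosages = (dosages1[:, 0] * dosages2[:, 1] +
                            dosages1[:, 1] * dosages2[:, 0])
        else:
            pair_dosages = dosages1[:, 0] * dosages1[:, 1]
        summed_dosages[len1 + len2] += pair_dosages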
parser = argparse.ArgumentParser()
parser.add_argument('phenotype')
parser.add_argument('--conditional')
parser.add_argument('--binary', default=False, choices={'logistic', 'linear'})
args = parser.parse_args()

phenotype = args.phenotype

shared_covars = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')

subset_transformed_phenotype = np.load(f'{ukb}/traits/subset_transformed_phenotypes/white_brits/{phenotype}.npy')

# shared covars here aren't necessarily going to have exactly mean 0 and std 1
# because they were standardized before subsetting, but that's okay
data = utils.merge_arrays(
    subset_transformed_phenotype,
    shared_covars
)
data = np.concatenate((data[:, 0:1], data), axis=1)

col_names = ['FID', 'IID']
if not args.binary:
    col_names.append(f'rin_{phenotype}')
else:
    col_names.append(phenotype)
    if args.binary == 'logistic':
        # plink expects and outputs a 1=control, 2=case encoding
        # instead of the 0=control, 1=case encoding we use elsewhere
        # (from: https://www.cog-genomics.org/plink/2.0/input
        # under the `--1` section)
        data[:, 2] += 1
    else:
Example #6
    today = datetime.datetime.now().strftime("%Y_%m_%d")
    data_fname = f'{ukb}/main_dataset/extracted_data/{phenotype}_{args.phenotype_field_id}.txt'
    readme.write(f"Run date: {today}\n")
    readme.write(f"Loading phenotype {phenotype} from txt file "
                 f" {data_fname} \n")

    data = np.genfromtxt(data_fname, skip_header=1, delimiter='\t')[:, 1:-1]
    # drop the first and last columns, which, because of the way the data is
    # extracted and then read by numpy, are always nans

    readme.write(f"Subsetting to samples at {args.samples}\n")
    # load samples that have passed qc
    samples = np.genfromtxt(args.samples, skip_header=1)

    # subset to those samples
    data = utils.merge_arrays(samples.reshape(-1, 1), data)

    # number of samples with this phenotype at any assessment
    num_samples = np.sum(np.any(~np.isnan(data[:, 1:]), axis=1))

    # drop categorical covariate values carried by fewer than this
    # many samples or this fraction of samples
    cat_drop_num = 50
    cat_drop_frac = 0.001  # 0.1%

    covar_datas = []
    reverse_covar_hashes = []
    for covar in args.categorical_covars:
        covar_name, covar_id = covar.split(',')
        covar_fname = f'{ukb}/main_dataset/extracted_data/{covar_name}_{covar_id}.txt'
        readme.write(
    ])
    assert result.shape[0] == 1
        
    pheno_data = np.load(pheno_datas_d[ethnicity])

    bgen_samples = []
    with open(f'{ukb}/microarray/ukb46122_hap_chr1_v2_s487314.sample') as samplefile:
        for num, line in enumerate(samplefile):
            if num <= 1:
                # skip first two lines
                continue
            bgen_samples.append(line.split()[0])
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)

    merged_arr = utils.merge_arrays(samples_array, pheno_data)
    unfiltered_subset = ~np.isnan(merged_arr[:, 1])
    n_samples = np.sum(unfiltered_subset)

    vcf = cyvcf2.VCF(args.imputed_vcf)
    found_rec = False
    for record in vcf(f'{args.chrom}:{args.pos}-{args.pos}'):
        if record.POS < args.pos:
            continue
        if record.INFO.get('PERIOD') is None:
            continue

        assert not found_rec
        found_rec = True

        trrecord = trh.HarmonizeRecord(vcfrecord=record, vcftype='beagle-hipstr')
    if not args.zero_one_neg_nan:
        # cols: id, date first reported
        data = load_date_data_field(data_fname)
        # cols: id, is_case
        data[:, 1] = ~np.isnan(data[:, 1])
    else:
        # cols: id, is_case
        data = load_0_1_neg_nan_field(data_fname)

    readme.write(f"Subsetting to samples at {args.samples}\n")
    # load samples that have passed qc
    samples = np.genfromtxt(args.samples, skip_header=1)

    # TODO test this
    # subset to those samples
    data = utils.merge_arrays(samples.reshape(-1, 1), data)

    year_of_birth = np.genfromtxt(
        f'{ukb}/main_dataset/extracted_data/year_of_birth_34.txt',
        delimiter='\t',
        skip_header=1)[:, 1:-1]
    month_of_birth = np.genfromtxt(
        f'{ukb}/main_dataset/extracted_data/month_of_birth_52.txt',
        delimiter='\t',
        skip_header=1)[:, 1:-1]

    # cols: id, is case, year of birth, month of birth
    data = utils.merge_arrays(utils.merge_arrays(data, year_of_birth),
                              month_of_birth)
    missing_birth = np.isnan(data[:, 2]) | np.isnan(data[:, 3])
    assert np.sum(missing_birth) < 100
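    # hypothetical next step (not shown in this excerpt): drop the few samples
    # missing birth info and collapse year + month into a fractional birth
    # date to use as a covariate
    data = data[~missing_birth, :]
    birth_date = data[:, 2] + (data[:, 3] - 0.5) / 12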
Example #9
def generate_figure(assoc_results_fname, pheno_data_fname, chrom, pos,
                    phenotype, dosage_fraction_threshold, unit, binary,
                    publication):

    assert bool(unit) or binary

    assert 0 <= dosage_fraction_threshold <= 1

    if not binary:
        y_axis_label = 'Mean ' + phenotype.replace('_', ' ') + f' ({unit})'
    else:
        y_axis_label = 'Fraction ' + phenotype.replace('_', ' ') + ' cases'

    figure = bokeh.plotting.figure(
        width=600,
        height=600,
        y_axis_label=y_axis_label,
        x_axis_label='Sum of allele lengths (repeat copies)')
    figure.grid.grid_line_color = None
    figure.background_fill_color = None
    figure.border_fill_color = None
    figure.toolbar_location = None
    figure.title.text_font_size = '18px'
    figure.axis.axis_label_text_font_size = '18px'
    figure.axis.major_label_text_font_size = '14px'

    if not binary:
        stat_name = 'mean'
    else:
        stat_name = 'fraction'

    def fix_header(header):
        # The paired-dosage columns reuse the names '0.05_significance_CI' and
        # '5e-8_significance_CI' already used by the single-dosage columns, so
        # rename the last occurrence of each (rpartition splits on the last
        # match) to a dummy name to keep the column names unique for polars.
        def fix_header_helper(_):
            part1 = header.rpartition('0.05_significance_CI')
            fix1 = part1[0] + 'foo' + part1[2]
            part2 = fix1.rpartition('5e-8_significance_CI')
            fix2 = part2[0] + 'bar' + part2[2]
            return fix2.split('\t')

        return fix_header_helper

    with open(assoc_results_fname) as tsv:
        header = tsv.readline().strip()
    result = pl.scan_csv(
        assoc_results_fname,
        sep='\t',
        dtypes={
            'locus_filtered': str
        },
        skip_rows=1,
        has_header=False,
        with_column_names=fix_header(header)).filter(
            (pl.col('chrom') == chrom)
            & (pl.col('pos') == pos)).collect().select(
                [  # have to collect first due to some sort of bug
                    'motif', '0.05_significance_CI', '5e-8_significance_CI',
                    f'{stat_name}_{phenotype}_per_single_dosage',
                    'total_subset_dosage_per_summed_gt'
                ])
    assert result.shape[0] == 1

    pheno_data = np.load(pheno_data_fname)

    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)

    merged_arr = utils.merge_arrays(samples_array, pheno_data)
    unfiltered_subset = ~np.isnan(merged_arr[:, 1])
    n_samples = np.sum(unfiltered_subset)

    subset_summed_dosage_fractions = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['total_subset_dosage_per_summed_gt'].to_numpy()[0]).items()
    }
    total_dosage = np.sum(list(subset_summed_dosage_fractions.values()))
    subset_summed_dosage_fractions = {
        key: val / total_dosage
        for key, val in subset_summed_dosage_fractions.items()
    }

    alleles = sorted(
        allele for allele, frac in subset_summed_dosage_fractions.items()
        if frac >= dosage_fraction_threshold)

    mean_per_dosage = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result[f'{stat_name}_{phenotype}_per_single_dosage'].to_numpy()[0]
        ).items()
    }
    ci5e_2 = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['0.05_significance_CI'].to_numpy()[0]).items()
    }
    ci5e_8 = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['5e-8_significance_CI'].to_numpy()[0]).items()
    }
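    # Note: the per-dosage statistics and CIs are stored in the TSV as
    # stringified Python dicts, hence the ast.literal_eval round-trips above,
    # e.g. (values invented for illustration):
    #   ast.literal_eval('{5.0: (61.8, 62.4), 6.0: (62.1, 62.7)}')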
    y_min = min(ci5e_8[allele][0] for allele in alleles)
    y_max = max(ci5e_8[allele][1] for allele in alleles)

    figure.varea(alleles, [ci5e_2[allele][1] for allele in alleles],
                 [ci5e_8[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.2,
                 legend_label='1 - 5e-8 Confidence Interval')
    figure.varea(alleles, [ci5e_2[allele][0] for allele in alleles],
                 [ci5e_2[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.4,
                 legend_label='0.95 Confidence Interval')
    figure.varea(alleles, [ci5e_8[allele][0] for allele in alleles],
                 [ci5e_2[allele][0] for allele in alleles],
                 color="red",
                 alpha=0.2)
    figure.line(alleles, [mean_per_dosage[allele] for allele in alleles],
                line_width=2,
                color="black")
    figure.circle(alleles, [mean_per_dosage[allele] for allele in alleles],
                  color="black",
                  size=6,
                  legend_label='mean')
    figure.legend.label_text_font_size = '10px'

    figure.y_range = bokeh.models.Range1d(y_min - 0.05 * (y_max - y_min),
                                          y_max + 0.05 * (y_max - y_min))

    figure.add_layout(
        bokeh.models.Title(text=f'STR {chrom}:{pos}',
                           align="center",
                           text_font_size='18px'), "above")
    figure.add_layout(
        bokeh.models.Title(text=phenotype.replace('_', ' ').capitalize() +
                           " vs genotype",
                           align="center",
                           text_font_size='18px'), "above")

    if not publication:
        figure.add_layout(
            bokeh.models.Title(
                text="Phenotype values are unadjusted for covariates",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text=
                "People contribute to each genotype based on their prob. of having that genotype",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(text="Only considers tested individuals",
                               align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text=
                f"Genotypes with dosages less than {100*dosage_fraction_threshold}% of the population are omitted",
                align="center"), "below")

    return figure
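A hedged usage note: the returned bokeh figure can be rendered with bokeh's
exporters (export_png additionally requires selenium and a webdriver); the
arguments below are hypothetical.

import bokeh.io

fig = generate_figure('height_results.tab', 'height.npy', 1, 1000000,
                      'height', 0.001, 'cm', binary=False, publication=True)
bokeh.io.export_png(fig, filename='str_1_1000000_height.png')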
Example #10
def main():
    # do an association test of the combined dosage of the listed alleles vs the listed phenotypes in each ethnicity
    parser = argparse.ArgumentParser()
    #parser.add_argument('chrom', type=int)
    #parser.add_argument('pos', type=int)
    parser.add_argument('var_file')
    #parser.add_argument('--phenotypes', nargs='+')
    #parser.add_argument('--alleles', type=int, nargs='+') #allele indices, i.e. this SNP is present in alleles 0 (ref), 2 (2nd alt) and 5
    args = parser.parse_args()

    print(
        'str\tvariant\tethnicity\tvar frequency\tstr var r2\tphenotype\tstr p-val\tvar p-val\tstr p-val conditioning on var'
    )

    scovs = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')

    variants = pl.read_csv(args.var_file, sep='\t')
    for i in range(variants.shape[0]):
        chrom = variants[i, 'chrom']
        pos = variants[i, 'pos']
        str_ = f'{chrom}:{pos}'
        var = next(
            cyvcf2.VCF(
                f'{ukb}/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chrom}.vcf.gz'
            )(str_))
        alleles = [int(num) for num in variants[i, 'alleles'].split(',')]
        name = variants[i, 'name']
        phenotypes = variants[i, 'phenos'].split(',')
        for phenotype in phenotypes:
            for ethnicity in ('white_brits', 'black', 'south_asian', 'chinese',
                              'irish', 'white_other'):
                #ethnicity
                total_samp_idx = sample_utils.get_samples_idx_ethnicity(
                    ethnicity)
                total_var_gts = var_dosage_gts(var, total_samp_idx, alleles)
                total_str_gts = str_dosage_gts(var, total_samp_idx)
                var_freq = np.sum(total_var_gts) / (2 * total_var_gts.shape[0])
                corr = np.corrcoef(total_var_gts, total_str_gts)
                assert corr.shape == (2, 2)
                str_var_r2 = corr[0, 1]**2
                #ethnicity,pheno
                pcovs = np.load(
                    f'{ukb}/traits/subset_transformed_phenotypes/{ethnicity}/{phenotype}.npy'
                )
                samps = sample_utils.get_ordered_samples_phenotype(
                    ethnicity, phenotype).reshape(-1, 1)
                covs = python_array_utils.merge_arrays(
                    python_array_utils.merge_arrays(samps, pcovs), scovs)

                outcomes = covs[:, 1]
                covs = covs[:, 2:]

                samp_idx = sample_utils.get_samples_idx_phenotype(
                    ethnicity, phenotype)
                raw_var_gts = var_dosage_gts(var, samp_idx, alleles)
                str_gts = standardize(str_dosage_gts(var, samp_idx))
                str_best_guess_gts = trh.HarmonizeRecord(
                    vcfrecord=var,
                    vcftype='beagle-hipstr').GetGenotypeIndicies()[
                        samp_idx, :-1]

                str_p = OLS(
                    outcomes,
                    np.hstack((covs, np.ones((covs.shape[0], 1)),
                               str_gts.reshape(-1, 1)))).fit().pvalues[-1]

                # test for a monomorphic variant on the raw dosages, since
                # standardizing a constant vector would divide by zero
                if np.all(raw_var_gts == 0) or np.all(raw_var_gts == 2):
                    var_p = 1
                    str_cond_p = str_p
                else:
                    var_gts = standardize(raw_var_gts)
                    var_p = OLS(
                        outcomes,
                        np.hstack((covs, np.ones((covs.shape[0], 1)),
                                   var_gts.reshape(-1, 1)))).fit().pvalues[-1]
                    str_cond_p = OLS(
                        outcomes,
                        np.hstack((covs, np.ones(
                            (covs.shape[0], 1)), var_gts.reshape(-1, 1),
                                   str_gts.reshape(-1, 1)))).fit().pvalues[-1]

                print(
                    f'{str_}\t{name}\t{ethnicity}\t{var_freq:.3g}\t{str_var_r2:.3g}\t{phenotype}\t{str_p:.3g}\t{var_p:.3g}\t{str_cond_p:.3g}'
                )
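standardize is not defined in this excerpt; it is assumed to be the usual
zero-mean, unit-variance scaling:

def standardize(arr):
    # assumed helper: center, then scale to unit variance
    return (arr - np.mean(arr)) / np.std(arr)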
Example #11
scovs = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')
print(
    'phenotype\tethnicity\tcompound coeff single test (s.d.)\tA coeff (s.d.)\tTA coeff (s.d.)\tCA coeff (s.d.)'
)
for phenotype in ('mean_platelet_volume', 'platelet_distribution_width',
                  'platelet_count'):
    for ethnicity in ('white_brits', 'black', 'south_asian', 'chinese',
                      'irish', 'white_other'):
        print(f'{phenotype}\t{ethnicity}', end='')
        pcovs = np.load(
            f'{ukb}/traits/subset_transformed_phenotypes/{ethnicity}/{phenotype}.npy'
        )
        samps = sample_utils.get_ordered_samples_phenotype(
            ethnicity, phenotype).reshape(-1, 1)
        covs = python_array_utils.merge_arrays(
            python_array_utils.merge_arrays(samps, pcovs), scovs)

        outcomes = covs[:, 1]
        covs = covs[:, 2:]

        samp_idx = sample_utils.get_samples_idx_phenotype(ethnicity, phenotype)
        itr = load_PACSIN2.get_gt_itr(samp_idx)
        next(itr)  # skip details
        compound, a, ta, ca = [next(itr)[0] for _ in range(4)]
        compound, a, ta, ca = [
            np.sum([
                len_ * np.sum(dosages, axis=1)
                for len_, dosages in dosage_dict.items()
            ],
                   axis=0).reshape(-1, 1)
            for dosage_dict in (compound, a, ta, ca)
        ]
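        # hypothetical continuation matching the header printed above: regress
        # the outcome on the three allele dosages jointly (plus covariates and
        # an intercept) and print each coefficient with its standard error
        design = np.hstack((covs, np.ones((covs.shape[0], 1)), a, ta, ca))
        fit = OLS(outcomes, design).fit()
        for coeff, se in zip(fit.params[-3:], fit.bse[-3:]):
            print(f'\t{coeff:.3g} ({se:.3g})', end='')
        print()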
Example #12
def perform_regional_gwas_helper(outfile,
                                 pheno_and_covars_fname,
                                 shared_covars_fname,
                                 untransformed_phenotypes_fname,
                                 get_genotype_iter,
                                 phenotype,
                                 binary,
                                 region,
                                 runtype,
                                 conditional_covars_fname=None):

    outfile.write("chrom\tpos\talleles\tlocus_filtered\t"
                  f"p_{phenotype}\tcoeff_{phenotype}\t")
    if binary != 'logistic':
        outfile.write(f'se_{phenotype}\tR^2\t')
    else:
        outfile.write("unused_col\tunused_col\t")
    outfile.flush()

    n_loci = 0
    batch_time = 0
    batch_size = 50
    total_time = 0

    pheno_specific_covars = np.load(pheno_and_covars_fname)
    shared_covars = np.load(shared_covars_fname)
    covars = utils.merge_arrays(pheno_specific_covars, shared_covars)

    if conditional_covars_fname:
        gt_covars = np.load(conditional_covars_fname)
        covars = utils.merge_arrays(covars, gt_covars)

    # order samples according to order in genetics files
    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
    merge = utils.merge_arrays(samples_array, covars)
    unfiltered_samples = ~np.isnan(merge[:, 1])

    outcome = merge[unfiltered_samples, 1].copy()
    covars = merge[unfiltered_samples, :]
    covars = (covars - np.mean(covars, axis=0)) / np.std(covars, axis=0)
    covars[:, 1] = 1  # reuse the column that was the outcome as the intercept

    ori_phenotypes = np.load(untransformed_phenotypes_fname)
    ori_phenotypes = utils.merge_arrays(samples_array, ori_phenotypes)[:, 1]
    ori_phenotypes = ori_phenotypes[unfiltered_samples]

    # first yield is special
    genotype_iter = get_genotype_iter(unfiltered_samples)
    extra_detail_fields = next(genotype_iter)
    outfile.write('\t'.join(extra_detail_fields) + '\t')

    if not binary:
        stat = 'mean'
    else:
        stat = 'fraction'

    outfile.write(f'{stat}_{phenotype}_per_single_dosage\t'
                  '0.05_significance_CI\t'
                  '5e-8_significance_CI')

    if runtype == 'strs':
        outfile.write('\ttotal_subset_dosage_per_summed_gt\t'
                      f'{stat}_{phenotype}_per_paired_dosage\t'
                      '0.05_significance_CI\t'
                      '5e-8_significance_CI')
    outfile.write('\n')
    outfile.flush()

    start_time = time.time()
    for dosage_gts, unique_alleles, chrom, pos, locus_filtered, locus_details in genotype_iter:
        assert len(locus_details) == len(extra_detail_fields)

        # reuse the column that was the ids as the genotypes
        covars[:, 0] = np.nan

        n_loci += 1
        allele_names = ','.join(list(unique_alleles.astype(str)))
        outfile.write(f"{chrom}\t{pos}\t{allele_names}\t")
        if locus_filtered:
            outfile.write(f'{locus_filtered}\t1\tnan\tnan\tnan\t')
            outfile.write('\t'.join(locus_details))
            if runtype == 'strs':
                outfile.write('\tnan' * 6 + '\n')
            else:
                outfile.write('\tnan' * 3 + '\n')
            outfile.flush()
            continue
        else:
            outfile.write('False\t')

        if runtype == 'strs':
            gts = np.sum([
                _len * np.sum(dosages, axis=1)
                for _len, dosages in dosage_gts.items()
            ],
                         axis=0)
        else:
            gts = dosage_gts[:, 1] + 2 * dosage_gts[:, 2]
        # standardize the genotypes; keep std so the coefficient and standard
        # error can be rescaled back to per-unit-genotype when written below
        std = np.std(gts)
        gts = (gts - np.mean(gts)) / std
        covars[:, 0] = gts

        if not binary or binary == 'linear':
            # run the regression
            model = OLS(
                outcome,
                covars,
                missing='drop',
            )
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            se = reg_result.bse[0]
            rsquared = reg_result.rsquared
            outfile.write(f"{pval:.2e}\t{coef/std}\t{se/std}\t{rsquared}\t")
        else:
            model = sm.GLM(outcome,
                           covars,
                           missing='drop',
                           family=sm.families.Binomial())
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            outfile.write(f'{pval:.2e}\t{coef/std}\tnan\tnan\t')

        outfile.write('\t'.join(locus_details) + '\t')

        if runtype == 'strs':
            single_dosages = {}

            paired_dosages = {}
            for len1 in unique_alleles:
                for len2 in unique_alleles:
                    if len1 > len2:
                        continue
                    if len1 != len2:
                        dosages = (
                            dosage_gts[len1][:, 0] * dosage_gts[len2][:, 1] +
                            dosage_gts[len1][:, 1] * dosage_gts[len2][:, 0])
                    else:
                        dosages = dosage_gts[len1][:, 0] * dosage_gts[len1][:, 1]
                    if np.sum(dosages) <= 0:
                        continue
                    summed_len = round(len1 + len2, 2)
                    if summed_len not in single_dosages:
                        single_dosages[summed_len] = dosages
                    else:
                        single_dosages[summed_len] += dosages
                    minlen = min(len1, len2)
                    maxlen = max(len1, len2)
                    paired_dosages[(minlen, maxlen)] = dosages
            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            paired_dosage_stat = {}
            paired_dosage_95_CI = {}
            paired_dosage_GWAS_CI = {}
            if not binary:
                for _len, dosages in single_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    single_dosage_stat[_len] = mean_stats.mean
                    single_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[_len] = mean_stats.tconfint_mean(
                        5e-8)
                for _len, dosages in paired_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    paired_dosage_stat[_len] = mean_stats.mean
                    paired_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    paired_dosage_GWAS_CI[_len] = mean_stats.tconfint_mean(
                        5e-8)
            else:
                for _len, dosages in single_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    single_dosage_stat[_len] = p
                    single_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
                for _len, dosages in paired_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    paired_dosage_stat[_len] = p
                    paired_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 5e-8)
                    paired_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) +
                '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(
                    {key: np.sum(arr)
                     for key, arr in single_dosages.items()}) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_GWAS_CI) +
                '\n')
        else:
            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            if not binary:
                for alt_count in range(3):
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosage_gts[:, alt_count])
                    single_dosage_stat[alt_count] = mean_stats.mean
                    single_dosage_95_CI[alt_count] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[
                        alt_count] = mean_stats.tconfint_mean(5e-8)
            else:
                for alt_count in range(3):
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosage_gts[:, alt_count], ori_phenotypes, 0.05)
                    single_dosage_stat[alt_count] = p
                    single_dosage_95_CI[alt_count] = (lower, upper)
                    _, lower_gwas, upper_gwas = weighted_binom_conf.weighted_binom_conf(
                        dosage_gts[:, alt_count], ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[alt_count] = (lower_gwas, upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) +
                '\n')

        outfile.flush()

        duration = time.time() - start_time
        total_time += duration
        batch_time += duration
        if n_loci % batch_size == 0:
            print(
                f"time/locus (last {batch_size}): "
                f"{batch_time/batch_size}s\n"
                f"time/locus ({n_loci} total loci): {total_time/n_loci}s\n",
                flush=True)
            batch_time = 0
        start_time = time.time()
    if n_loci > 0:
        print(
            f"Done.\nTotal loci: {n_loci}\nTotal time: {total_time}s\ntime/locus: {total_time/n_loci}s\n",
            flush=True)
    else:
        print(f"No variants found in the region {region}\n", flush=True)