Example 1
def main(args):
    """Find doubleton pairs and compare to related pairs."""
    try:
        hl.init(log="/test_doubletons_relatedness.log",
                default_reference="GRCh38")
        compare_doubletons_to_related()

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(args.temp_path)
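A minimal, self-contained sketch of the same pattern (the logger setup and the destination bucket below are placeholders, not from the original source):

import logging

import hail as hl

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    hl.init(log="/pipeline.log", default_reference="GRCh38")
    # ... run the actual pipeline here ...
finally:
    # Persist the Hail driver log even when the pipeline fails.
    logger.info("Copying hail log to logging bucket...")
    hl.copy_log("gs://my-logging-bucket/logs/")  # placeholder bucket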
Example 2
import glob
import os
import subprocess
import time

import hail as hl


def copy_logs_output(log_dir, log_file, plot_dir):
    if not log_dir.endswith("/"):
        log_dir = log_dir + "/"

    datestr = time.strftime("%Y.%m.%d")
    hail_log_name = os.path.join(log_dir, datestr + "_hail_log.txt")
    hl.copy_log(hail_log_name)

    subprocess.call(['gsutil', 'cp', log_file, log_dir])

    # Wildcards are not expanded when subprocess receives an argument list
    # (no shell is involved), so glob the local files explicitly.
    for pattern in ('*.html', '*.pdf'):
        for path in glob.glob(pattern):
            subprocess.call(['gsutil', 'cp', path, plot_dir])
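If gsutil is not available on the driver, Hail's own filesystem helper can upload a local file to the same destination; a sketch with placeholder paths:

import hail as hl

# hadoop_copy works across local and cloud filesystems.
hl.hadoop_copy("file:///tmp/pipeline_output.log",
               "gs://my-bucket/logs/pipeline_output.log")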
Example 3
def test_hadoop_copy_log(self):
    with with_local_temp_file('log') as r:
        hl.copy_log(r)
        stats = hl.hadoop_stat(r)
        self.assertTrue(stats['size_bytes'] > 0)
Example 4
def test_hadoop_copy_log(self):
    r = resource('copy_log_test.txt')
    hl.copy_log(r)
    stats = hl.hadoop_stat(r)
    self.assertTrue(stats['size_bytes'] > 0)
Example 5
def main(args):
    subsets = args.subsets
    hl.init(
        log=f"/generate_frequency_data{'.' + '_'.join(subsets) if subsets else ''}.log",
        default_reference="GRCh38",
    )

    invalid_subsets = []
    n_subsets_use_subpops = 0
    for s in subsets:
        if s not in SUBSETS:
            invalid_subsets.append(s)
        if s in COHORTS_WITH_POP_STORED_AS_SUBPOP:
            n_subsets_use_subpops += 1

    if invalid_subsets:
        raise ValueError(
            f"{', '.join(invalid_subsets)} subset(s) are not one of the following official subsets: {SUBSETS}"
        )
    if n_subsets_use_subpops and n_subsets_use_subpops != len(subsets):
        raise ValueError(
            f"All or none of the supplied subset(s) should be in the list of cohorts that need to use subpops instead "
            f"of pops in frequency calculations: {COHORTS_WITH_POP_STORED_AS_SUBPOP}"
        )

    try:
        logger.info("Reading full sparse MT and metadata table...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            release_only=not args.include_non_release,
            samples_meta=True,
        )

        if args.test:
            logger.info("Filtering to two partitions on chr20")
            mt = hl.filter_intervals(
                mt, [hl.parse_locus_interval("chr20:1-1000000")])
            mt = mt._filter_partitions(range(2))

        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        if args.include_non_release:
            logger.info("Filtering MT columns to high quality samples")
            total_sample_count = mt.count_cols()
            mt = mt.filter_cols(mt.meta.high_quality)
            high_quality_sample_count = mt.count_cols()
            logger.info(
                f"Filtered {total_sample_count - high_quality_sample_count} samples from the full set "
                f"of {total_sample_count} samples...")

        if subsets:
            mt = mt.filter_cols(hl.any([mt.meta.subsets[s] for s in subsets]))
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples in {', '.join(subsets)} subset(s)..."
            )
        else:
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples..."
            )

        logger.info("Computing adj and sex adjusted genotypes...")
        mt = mt.annotate_entries(
            GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT,
                                        mt.meta.sex_imputation.sex_karyotype),
            adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
        )

        logger.info("Densify-ing...")
        mt = hl.experimental.densify(mt)
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)

        # Temporary hotfix for depletion of homozygous alternate genotypes
        logger.info(
            "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
        )
        # Load v3.0 allele frequencies to avoid an extra frequency calculation
        # NOTE: Using previous callset AF works for small incremental changes to a callset, but we will need to revisit for large increments
        freq_ht = get_freq(version="3").ht()
        freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

        mt = mt.annotate_entries(GT=hl.if_else(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        ))

        logger.info("Generating frequency data...")
        if subsets:
            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                # NOTE: TGP and HGDP labeled populations are highly specific
                # and are stored in the project_subpop meta field
                pop_expr=mt.meta.project_meta.project_subpop
                if n_subsets_use_subpops
                else mt.meta.population_inference.pop,
            )

            # NOTE: no FAFs or popmax needed for subsets
            mt = mt.select_rows("freq")

            logger.info(
                f"Writing out frequency data for {', '.join(subsets)} subset(s)..."
            )
            if args.test:
                mt.rows().write(
                    get_checkpoint_path(
                        f"chr20_test_freq.{'_'.join(subsets)}"),
                    overwrite=True,
                )
            else:
                mt.rows().write(get_freq(subset="_".join(subsets)).path,
                                overwrite=args.overwrite)

        else:
            logger.info("Computing age histograms for each variant...")
            # NOTE: most age data is stored as integers in the 'age' annotation, but for
            # a select number of samples, age is stored as a bin range and 'age_alt'
            # corresponds to an integer in the middle of the bin
            mt = mt.annotate_cols(age=hl.if_else(
                hl.is_defined(mt.meta.project_meta.age),
                mt.meta.project_meta.age,
                mt.meta.project_meta.age_alt,
            ))
            mt = mt.annotate_rows(**age_hists_expr(mt.adj, mt.GT, mt.age))

            # Compute callset-wide age histogram global
            mt = mt.annotate_globals(age_distribution=mt.aggregate_cols(
                hl.agg.hist(mt.age, 30, 80, 10)))

            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop,
                downsamplings=DOWNSAMPLINGS,
            )
            # Remove all loci with raw AC=0
            mt = mt.filter_rows(mt.freq[1].AC > 0)

            logger.info("Calculating InbreedingCoeff...")
            # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
            mt = mt.annotate_rows(
                InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

            logger.info("Computing filtering allele frequencies and popmax...")
            faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus,
                                     POPS_TO_REMOVE_FOR_POPMAX)
            mt = mt.select_rows(
                "InbreedingCoeff",
                "freq",
                faf=faf,
                popmax=pop_max_expr(mt.freq, mt.freq_meta,
                                    POPS_TO_REMOVE_FOR_POPMAX),
            )
            mt = mt.annotate_globals(
                faf_meta=faf_meta,
                faf_index_dict=make_faf_index_dict(faf_meta))
            mt = mt.annotate_rows(popmax=mt.popmax.annotate(
                faf95=mt.faf[mt.faf_meta.index(
                    lambda x: x.values() == ["adj", mt.popmax.pop])].faf95))

            logger.info("Annotating quality metrics histograms...")
            # NOTE: these are performed here as the quality metrics histograms also require densifying
            mt = mt.annotate_rows(
                qual_hists=qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD, mt.adj))
            ht = mt.rows()
            ht = ht.annotate(
                qual_hists=hl.Struct(
                    **{
                        i.replace("_adj", ""): ht.qual_hists[i]
                        for i in ht.qual_hists if "_adj" in i
                    }),
                raw_qual_hists=hl.Struct(**{
                    i: ht.qual_hists[i]
                    for i in ht.qual_hists if "_adj" not in i
                }),
            )

            logger.info("Writing out frequency data...")
            if args.test:
                ht.write(get_checkpoint_path("chr20_test_freq"),
                         overwrite=True)
            else:
                ht.write(get_freq().path, overwrite=args.overwrite)

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(f"{qc_temp_prefix()}logs/")
Example 6
                        help='Radius of window for LD matrix')
    parser.add_argument('--ld-score-radius',
                        type=int,
                        default=1_000_000,  # keep the default an int to match type=int
                        help='Radius of window for LD score')
    parser.add_argument('--write-mt',
                        action='store_true',
                        help='Write MatrixTable from bgen')
    parser.add_argument('--write-bm',
                        action='store_true',
                        help='Write BlockMatrix from MatrixTable')
    parser.add_argument('--compute-ld-matrix',
                        action='store_true',
                        help='Compute LD matrix')
    parser.add_argument('--compute-ldscore',
                        action='store_true',
                        help='Compute LD score')
    parser.add_argument('--write-ldsc-hm3-snplist',
                        action='store_true',
                        help='Write QCed HM3 snplist for ldsc')
    parser.add_argument('--overwrite',
                        action='store_true',
                        help='Overwrite data')
    args = parser.parse_args()

    atexit.register(lambda: hl.copy_log(
        timestamp_path(f'gs://ukb-diverse-pops/ld/{args.pop}/ld',
                       suffix='.log')))

    main(args)
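A minimal, self-contained sketch of this atexit pattern (timestamp_path is a project helper, so the destination below is just a placeholder):

import atexit

import hail as hl

hl.init(log='/ld.log')
# Registered callbacks run at interpreter exit, so the log is copied
# even if main() raises.
atexit.register(lambda: hl.copy_log('gs://my-bucket/ld/ld.log'))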
Example 7
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(unnorm_beta=mt.summary_stats.BETA /
                             (mt.summary_stats.SE**2),
                             inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
                              META_SE=hl.map(lambda x: hl.sqrt(1 / x),
                                             mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA /
                                                              mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(
        META_Q=hl.map(
            lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
            mt.META_BETA),
        variant_exists=hl.map(lambda x: hl.is_defined(x),
                              mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return hl.or_missing(hl.is_finite(x), x)

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops', 'AF_Allele2',
        'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')