def pre_process_subset_freq(subset: str,
                            global_ht: hl.Table,
                            test: bool = False) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset `freq_meta` global (i.e., one `freq` entry for each `freq_meta` entry)

    :param subset: subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    """

    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"

    if test:
        if file_exists(subset_chr20_ht_path):
            logger.info(
                "Loading chr20 %s subset frequency data for testing: %s",
                subset,
                subset_chr20_ht_path,
            )
            subset_ht = hl.read_table(subset_chr20_ht_path)

        elif file_exists(subset_ht_path):
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")])

        else:
            raise DataException(
                f"Neither the chr20 test Table nor the full Hail Table containing {subset} subset frequencies was found. You may need to run the script generate_freq_data.py to generate frequency annotations first."
            )

    elif file_exists(subset_ht_path):
        logger.info("Loading %s subset frequency data: %s", subset,
                    subset_ht_path)
        subset_ht = hl.read_table(subset_ht_path)

    else:
        raise DataException(
            f"Hail Table containing {subset} subset frequencies not found. You may need to run the script generate_freq_data.py to generate frequency annotations first."
        )

    # Fill in missing freq structs
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(freq=hl.if_else(
        hl.is_missing(ht.freq),
        hl.map(lambda x: missing_callstats_expr(),
               hl.range(hl.len(ht.freq_meta))),
        ht.freq,
    ))

    return ht
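

def _example_pre_process_subset_freq() -> None:
    # Hypothetical usage sketch (not part of the original pipeline): fill in missing
    # subset frequency structs against the global frequency Table. The subset ID
    # "hgdp" is only an illustrative placeholder for an entry in the calling
    # script's SUBSETS list.
    global_freq_ht = (
        hl.read_table(get_freq().path).select("freq").select_globals("freq_meta")
    )
    subset_freq_ht = pre_process_subset_freq("hgdp", global_freq_ht, test=True)
    subset_freq_ht.describe()
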
def get_rf_runs(rf_json_fp: str) -> Dict:
    """
    Loads RF run data from JSON file.
    :param rf_json_fp: File path to rf json file.
    :return: Dictionary containing the content of the JSON file, or an empty dictionary if the file wasn't found.
    """
    if file_exists(rf_json_fp):
        with hl.hadoop_open(rf_json_fp) as f:
            return json.load(f)
    else:
        logger.warning(
            "File %s could not be found. Returning empty RF run hash dict.",
            rf_json_fp,
        )
        return {}
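

def _example_get_rf_runs() -> None:
    # Hypothetical usage sketch (not part of the original module): print a summary of
    # previously recorded RF runs. The JSON path below is an illustrative placeholder.
    rf_runs = get_rf_runs("gs://my-bucket/variant_qc/rf_runs.json")
    for run_hash, run_data in rf_runs.items():
        logger.info("RF run %s: %s", run_hash, run_data)
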
def get_release_file(file_path: str, version: str = CURRENT_RELEASE) -> str:
    """
    Tries to get the desired file from the corresponding release version on the google cloud.
    If the file is not found for the desired release version, falls back on previous versions.

    :param  str file_path: Desired file path, with {0} as placeholder(s) for the version number
    :param str version: Desired file version
    :return: Path for closest version of the file available
    """
    if file_exists(file_path.format(version)):
        return file_path.format(version)
    else:
        for v in range(RELEASES.index(version) - 1, -1, -1):
            if file_exists(file_path.format(RELEASES[v])):
                print(
                    "WARN: Resource {} could not be found for gnomAD release version {}.\n"
                    "Loading gnomAD release version {} of the file ({}).".format(
                        file_path.format(version), version, RELEASES[v],
                        file_path.format(RELEASES[v])))
                return file_path.format(RELEASES[v])

        print("ERROR: Resource {} could not be found for any release.".format(
            file_path.format(version)))
        return file_path.format(version)
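

def _example_get_release_file() -> None:
    # Hypothetical usage sketch (not part of the original module): resolve a versioned
    # resource path, falling back to earlier releases if the current one is missing.
    # The path template below is an illustrative placeholder; "{0}" is filled with the
    # release version.
    template = "gs://my-bucket/release/{0}/frequencies.ht"
    resolved = get_release_file(template, version=CURRENT_RELEASE)
    print(f"Using resource: {resolved}")
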
def main(args):
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"]

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. It is stored as a dictionary in annotation_hists_path.
    ANNOTATIONS_HISTS.pop("InbreedingCoeff")

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Define hist ranges and bins separately for each allele frequency group, since InbreedingCoeff requires different ranges per group
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x]),
                )
            ).annotate(metric="InbreedingCoeff" + "-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # NOTE: The following removes the closing ']' from the JSON array stored in hists and the opening '[' from the
        # JSON array stored in inbreeding_hists, then joins them so they can be written out as a single JSON
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
def main(args):
    hl.init(log="/create_release_ht.log", default_reference="GRCh38")

    # The concatenated HT contains all subset frequency annotations, plus the overall cohort frequency annotations,
    # concatenated together in a single freq annotation ('freq')

    # Load global frequency Table
    if args.test:
        global_freq_chr20_ht_path = "gs://gnomad-tmp/gnomad_freq/chr20_test_freq.ht"

        if file_exists(global_freq_chr20_ht_path):
            logger.info(
                "Loading chr20 global frequency data for testing: %s",
                global_freq_chr20_ht_path,
            )
            global_freq_ht = (hl.read_table(global_freq_chr20_ht_path).select(
                "freq").select_globals("freq_meta"))

        elif file_exists(get_freq().path):
            logger.info("Loading global frequency data for testing: %s",
                        get_freq().path)
            global_freq_ht = (hl.read_table(
                get_freq().path).select("freq").select_globals("freq_meta"))
            global_freq_ht = hl.filter_intervals(
                global_freq_ht, [hl.parse_locus_interval("chr20:1-1000000")])

        else:
            raise DataException(
                "Neither the chr20 test Table nor the Hail Table containing global callset frequencies was found. You may need to run the script to generate frequency annotations first."
            )

    elif file_exists(get_freq().path):
        logger.info("Loading global frequency data: %s", get_freq().path)
        global_freq_ht = (hl.read_table(
            get_freq().path).select("freq").select_globals("freq_meta"))

    else:
        raise DataException(
            "Hail Table containing global callset frequencies not found. You may need to run the script to generate frequency annotations first."
        )

    # Load subset frequency Table(s)
    if args.test:
        test_subsets = args.test_subsets
        subset_freq_hts = [
            pre_process_subset_freq(subset, global_freq_ht, test=True)
            for subset in test_subsets
        ]

    else:
        subset_freq_hts = [
            pre_process_subset_freq(subset, global_freq_ht)
            for subset in SUBSETS
        ]

    logger.info("Concatenating subset frequencies...")
    freq_ht = hl.Table.multi_way_zip_join(
        [global_freq_ht] + subset_freq_hts,
        data_field_name="freq",
        global_field_name="freq_meta",
    )
    freq_ht = freq_ht.transmute(freq=freq_ht.freq.flatmap(lambda x: x.freq))
    freq_ht = freq_ht.transmute_globals(
        freq_meta=freq_ht.freq_meta.flatmap(lambda x: x.freq_meta))

    # Create frequency index dictionary on concatenated array (i.e., including all subsets)
    # NOTE: non-standard downsampling values are created in the frequency script corresponding to population totals, so
    # callset-specific DOWNSAMPLINGS must be used instead of the generic DOWNSAMPLING values
    global_freq_ht = hl.read_table(get_freq().path)
    freq_ht = freq_ht.annotate_globals(freq_index_dict=make_freq_index_dict(
        freq_meta=hl.eval(freq_ht.freq_meta),
        pops=POPS,
        downsamplings=hl.eval(global_freq_ht.downsamplings),
    ))

    # Add back in all global frequency annotations not present in concatenated frequencies HT
    row_fields = global_freq_ht.row_value.keys() - freq_ht.row_value.keys()
    logger.info(
        "Adding back the following row annotations onto concatenated frequencies: %s",
        row_fields)
    freq_ht = freq_ht.annotate(**global_freq_ht[freq_ht.key].select(
        *row_fields))

    global_fields = global_freq_ht.globals.keys() - freq_ht.globals.keys()
    global_fields.remove("downsamplings")
    logger.info(
        "Adding back the following global annotations onto concatenated frequencies: %s",
        global_fields)
    freq_ht = freq_ht.annotate_globals(**global_freq_ht.index_globals().select(
        *global_fields))

    logger.info("Preparing release Table annotations...")
    ht = add_release_annotations(freq_ht)

    logger.info("Removing chrM and sites without filter...")
    ht = hl.filter_intervals(ht, [hl.parse_locus_interval("chrM")], keep=False)
    ht = ht.filter(hl.is_defined(ht.filters))

    ht = ht.checkpoint(
        qc_temp_prefix() + "release/gnomad.genomes.v3.1.sites.chr20.ht"
        if args.test else release_sites().path,
        args.overwrite,
    )
    logger.info("Final variant count: %d", ht.count())
    ht.describe()
    ht.show()
    ht.summarize()
def main(args):
    hl.init(log="/variant_qc_finalize.log")

    ht = get_score_bins(args.model_id, aggregated=False).ht()
    if args.filter_centromere_telomere:
        ht = ht.filter(
            ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

    info_ht = get_info(split=True).ht()
    ht = ht.filter(~info_ht[ht.key].AS_lowqual)

    if args.model_id.startswith("vqsr_"):
        ht = ht.drop("info")

    freq_ht = get_freq().ht()
    ht = ht.annotate(InbreedingCoeff=freq_ht[ht.key].InbreedingCoeff)
    freq_idx = freq_ht[ht.key]
    aggregated_bin_path = get_score_bins(args.model_id, aggregated=True).path
    if not file_exists(aggregated_bin_path):
        sys.exit(
            f"Could not find binned HT for model: {args.model_id} ({aggregated_bin_path}). Please run create_ranked_scores.py for that hash."
        )
    aggregated_bin_ht = get_score_bins(args.model_id, aggregated=True).ht()

    ht = generate_final_filter_ht(
        ht,
        args.model_name,
        args.score_name,
        ac0_filter_expr=freq_idx.freq[0].AC == 0,
        ts_ac_filter_expr=freq_idx.freq[1].AC == 1,
        mono_allelic_flag_expr=(freq_idx.freq[1].AF == 1) |
        (freq_idx.freq[1].AF == 0),
        snp_bin_cutoff=args.snp_bin_cutoff,
        indel_bin_cutoff=args.indel_bin_cutoff,
        snp_score_cutoff=args.snp_score_cutoff,
        indel_score_cutoff=args.indel_score_cutoff,
        inbreeding_coeff_cutoff=args.inbreeding_coeff_threshold,
        aggregated_bin_ht=aggregated_bin_ht,
        bin_id="bin",
        vqsr_ht=get_vqsr_filters(args.vqsr_model_id, split=True).ht()
        if args.vqsr_model_id else None,
    )
    ht = ht.annotate_globals(
        filtering_model=ht.filtering_model.annotate(model_id=args.model_id))
    if args.model_id.startswith("vqsr_"):
        ht = ht.annotate_globals(filtering_model=ht.filtering_model.annotate(
            snv_training_variables=[
                "AS_QD",
                "AS_MQRankSum",
                "AS_ReadPosRankSum",
                "AS_FS",
                "AS_SOR",
                "AS_MQ",
            ],
            indel_training_variables=[
                "AS_QD",
                "AS_MQRankSum",
                "AS_ReadPosRankSum",
                "AS_FS",
                "AS_SOR",
            ],
        ))
    else:
        ht = ht.annotate_globals(filtering_model=ht.filtering_model.annotate(
            snv_training_variables=ht.features,
            indel_training_variables=ht.features,
        ))

    ht.write(final_filter.path, args.overwrite)

    final_filter_ht = final_filter.ht()
    final_filter_ht.summarize()
def main(args):
    output_dir = args.output_dir
    output_name = args.output_name
    inferred_sex = args.inferred_sex
    mt_path = args.mt_path
    input_pedigree = args.input_pedigree

    gnomad_ld = args.gnomad_ld
    run_ibd = args.run_ibd
    first_degree_pi_hat = args.first_degree_pi_hat
    grandparent_pi_hat = args.grandparent_pi_hat
    grandparent_ibd1 = args.grandparent_ibd1
    grandparent_ibd2 = args.grandparent_ibd2
    filter_kinship_ht = args.filter_kinship_ht

    logger.info("Reading in inputs...")
    mt = hl.read_matrix_table(mt_path)
    pedigree = hl.import_table(input_pedigree, impute=True)

    # Infer build of the MatrixTable
    build = get_reference_genome(mt.locus).name

    logger.info(
        "Filtering to biallelic SNVs on autosomes and performing LD pruning..."
    )
    mt = filter_rows_for_qc(mt,
                            min_af=0.001,
                            min_callrate=0.99,
                            apply_hard_filters=False)
    mt = ld_prune(mt, build, gnomad_ld)
    out_mt = f"{output_dir}/{output_name}_processed_mt.mt"

    logger.info("Remapping sample names...")
    mt, sex_ht = remap_samples(mt_path, mt, pedigree, inferred_sex)

    mt = mt.checkpoint(out_mt, overwrite=True)

    if run_ibd:
        logger.info("Running identity by descent...")
        ibd_results_ht = hl.identity_by_descent(mt,
                                                maf=mt.AF,
                                                min=0.10,
                                                max=1.0)
        ibd_results_ht = ibd_results_ht.annotate(
            ibd0=ibd_results_ht.ibd.Z0,
            ibd1=ibd_results_ht.ibd.Z1,
            ibd2=ibd_results_ht.ibd.Z2,
            pi_hat=ibd_results_ht.ibd.PI_HAT,
        ).drop("ibs0", "ibs1", "ibs2", "ibd")
        out_ht = f"{output_dir}/{output_name}_ibd_kinship.tsv"
        ibd_results_ht.export(out_ht)

    else:
        logger.warning("Skipping IBD - using previous calculations...")
        if not file_exists(f"{output_dir}/{output_name}_ibd_kinship.tsv"):
            logger.warning(
                "IBD calculation was skipped, but no file with previous calculations was found..."
            )

    logger.info("Reading in kinship ht...")
    kin_ht = hl.import_table(f"{output_dir}/{output_name}_ibd_kinship.tsv",
                             impute=True)

    # Subset MatrixTable and sex ht to the samples in the pedigree
    mt_subset, sex_ht, expected_samples, vcf_samples = subset_samples(
        mt, pedigree, sex_ht, output_dir, output_name)

    # Subset Table to the samples in the pedigree
    subset = hl.set(expected_samples)
    kin_ht = kin_ht.filter(
        subset.contains(kin_ht.i) | subset.contains(kin_ht.j))

    # Key the Table
    kin_ht = kin_ht.key_by("i", "j")

    # Setup output file
    out_summary = hl.hadoop_open(
        f"{output_dir}/{output_name}_ped_check_summary.txt", "w")

    if filter_kinship_ht:
        logger.info(
            "Filtering kinship table to remove unrelated individuals from analysis..."
        )
        kin_ht = filter_kin_ht(kin_ht, out_summary)

    # Output basic stats
    out_summary.write("Number individuals in pedigree: " +
                      str(len(expected_samples)) + "\n")
    out_summary.write("Number individuals in subset from the VCF: " +
                      str(len(vcf_samples)) + "\n")
    out_summary.write("Number of relationships in the kinship table: " +
                      str(kin_ht.count()) + "\n\n")
    out_summary.close()

    seqr_projects, family_ids, given_sex = write_functional_pedigree(
        input_pedigree, vcf_samples, output_dir, output_name)

    # Compare inferred and given sex
    check_sex(sex_ht, output_dir, output_name)

    kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects,
                                                family_ids)

    logger.info("Writing kinship ht per project...")
    # Output original ht per project
    for project in set(seqr_projects.values()):
        full_ht = kin_ht.filter((kin_ht.seqr_proj_i == project)
                                | (kin_ht.seqr_proj_j == project))
        full_ht.drop("seqr_proj_i", "seqr_proj_j").export(
            f"{output_dir}/{project}/{output_name}_{project}_annotated_kin.txt"
        )