def annotate_sex(mt: hl.MatrixTable,
                 out_internal_mt_prefix: str,
                 male_threshold: float = 0.8,
                 female_threshold: float = 0.5) -> hl.MatrixTable:
    """
    Impute sex from chrX genotypes, export the result, and annotate the input MT.

    The input is restricted to the 'chrX' interval and its GT entry is rebuilt
    as an unphased diploid call from the alt-allele count before imputation.
    The full imputation table is exported to
    ``out_internal_mt_prefix + '.sex_check.txt.bgz'``; only ``f_stat`` and
    ``is_female`` are copied onto the columns of the original (unfiltered) MT.

    NOTE: Evaluated in R (plots) and decided on cutoff of F<0.5 for females
    and F>0.8 for males (default) for genomes.

    :param MatrixTable mt: MT containing samples to be ascertained for sex
    :param str out_internal_mt_prefix: file path prefix for the exported sex imputation table
    :param float male_threshold: F-statistic threshold passed to hl.impute_sex as ``male_threshold``
    :param float female_threshold: F-statistic threshold passed to hl.impute_sex as ``female_threshold``
    :return: input MatrixTable with column fields 'f_stat' and 'is_female' added
    :rtype: MatrixTable
    """
    # Work on a chrX-only copy; the caller's MT is annotated at the end.
    chrx_interval = hl.parse_locus_interval('chrX')
    chrx_mt = hl.filter_intervals(mt, [chrx_interval])

    # Replace GT with an unphased call derived from the number of alt alleles.
    unphased_gt = hl.unphased_diploid_gt_index_call(chrx_mt.GT.n_alt_alleles())
    chrx_mt = chrx_mt.select_entries(GT=unphased_gt)

    imputed_ht = hl.impute_sex(chrx_mt.GT,
                               aaf_threshold=0.05,
                               female_threshold=female_threshold,
                               male_threshold=male_threshold)

    # Export every imputation field before trimming the table for annotation.
    imputed_ht.export(out_internal_mt_prefix + '.sex_check.txt.bgz')

    kept_fields = imputed_ht.select('f_stat', 'is_female')
    return mt.annotate_cols(**kept_fields[mt.col_key])
Esempio n. 2
0
def annotate_sex(mt: hl.MatrixTable,
                 male_threshold: float = 0.6,
                 female_threshold: float = 0.4) -> hl.MatrixTable:
    """
    Impute sex and annotate the (unphased) MT columns with the result.

    The MT is first passed through ``unphase_mt``; imputation runs on that
    result with ``aaf_threshold=0.05`` and ``include_par=False``, and the
    returned MT is the unphased one with 'f_stat' and 'is_female' added as
    column fields.

    :param MatrixTable mt: MT containing samples to be ascertained for sex
    :param float male_threshold: F-statistic threshold passed to hl.impute_sex as ``male_threshold``
    :param float female_threshold: F-statistic threshold passed to hl.impute_sex as ``female_threshold``
    :return: unphased MatrixTable with column fields 'f_stat' and 'is_female'
    :rtype: MatrixTable
    """
    unphased = unphase_mt(mt)

    imputed_ht = hl.impute_sex(unphased.GT,
                               aaf_threshold=0.05,
                               female_threshold=female_threshold,
                               male_threshold=male_threshold,
                               include_par=False)

    # Keep only the two fields that should land on the columns.
    kept_fields = imputed_ht.select('f_stat', 'is_female')
    return unphased.annotate_cols(**kept_fields[unphased.col_key])
Esempio n. 3
0
    def test_impute_sex_same_as_plink(self):
        """Check that hl.impute_sex agrees with ``plink --check-sex``.

        The x-chromosome test VCF is imputed with Hail, exported, and run
        through plink; the two per-sample results (is_female, f_stat) must
        match within a tolerance of 1e-3. Finally, passing a precomputed
        alternate allele frequency row field via ``aaf`` must give the same
        table as letting impute_sex compute it.
        """
        import subprocess as sp

        ds = hl.import_vcf(resource('x-chromosome.vcf'))

        sex = hl.impute_sex(ds.GT, include_par=True)

        vcf_file = utils.uri_path(utils.new_temp_file(prefix="plink", suffix="vcf"))
        out_file = utils.uri_path(utils.new_temp_file(prefix="plink"))

        hl.export_vcf(ds, vcf_file)

        plink_cmd = [
            "plink", "--vcf", vcf_file, "--const-fid", "--check-sex",
            "--silent", "--out", out_file,
        ]
        try:
            sp.check_output(plink_cmd, stderr=sp.STDOUT)
        except sp.CalledProcessError as e:
            # Surface plink's own output before failing the test.
            print(e.output)
            raise e

        plink_types = {'SNPSEX': hl.tint32, 'F': hl.tfloat64}
        plink_sex = hl.import_table(out_file + '.sexcheck',
                                    delimiter=' +',
                                    types=plink_types)
        plink_sex = plink_sex.select('IID', 'SNPSEX', 'F')

        # plink SNPSEX: 2 -> female, 1 -> male, anything else -> missing.
        snpsex = plink_sex.SNPSEX
        male_or_missing = hl.cond(snpsex == 1, False, hl.null(hl.tbool))
        plink_is_female = hl.cond(snpsex == 2, True, male_or_missing)
        plink_sex = plink_sex.select(
            s=plink_sex.IID,
            is_female=plink_is_female,
            f_stat=plink_sex.F)
        plink_sex = plink_sex.key_by('s')

        sex = sex.select(s=sex.s,
                         is_female=sex.is_female,
                         f_stat=sex.f_stat)

        hail_sex = sex.select_globals()
        self.assertTrue(plink_sex._same(hail_sex, tolerance=1e-3))

        alt_freq = agg.call_stats(ds.GT, ds.alleles).AF[1]
        ds = ds.annotate_rows(aaf=alt_freq)

        computed = hl.impute_sex(ds.GT)
        supplied = hl.impute_sex(ds.GT, aaf='aaf')
        self.assertTrue(computed._same(supplied))
Esempio n. 4
0
def check_sex(mt):
    '''
    Impute sex for every sample and flag disagreements with the reported sex.

    The full hl.impute_sex result struct is stored per sample in the column
    field "imputedSex". "sex_filter" is True when the imputed `is_female`
    differs from `reported_sex == 'F'`, so any reported value other than 'F'
    (including 'U') is compared as "not female".

    :param mt: hail matrix table which contains a reported sex column of 'F', 'M', or 'U' named "reported_sex"
    :return: hail matrix table with new columns "imputedSex" and "sex_filter"
    '''
    imputed_ht = hl.impute_sex(mt.GT)
    annotated = mt.annotate_cols(imputedSex=imputed_ht[mt.s])

    reported_female = annotated.reported_sex == 'F'
    mismatch = annotated.imputedSex.is_female != reported_female
    return annotated.annotate_cols(sex_filter=mismatch)
Esempio n. 5
0
def impute_sex(mt):
    """
    Impute sex for every sample and return it as a plain Python dict.

    :param mt: Hail MatrixTable with a `GT` entry field and a sample ID column field `s`
    :return: dict mapping each sample ID (`mt.s`) to its imputed `is_female`
        value (True, False, or None when hl.impute_sex left it missing)
    """
    sex_ht = hl.impute_sex(mt.GT)

    # Pair each sample with its own result by joining on the column key.
    # Collecting `mt.s` and the impute_sex table separately and zipping them
    # is unsafe: the two collects are independent and their orders need not
    # agree (matrix-table column order vs. key-sorted table rows), which
    # would silently assign results to the wrong samples.
    sample_sex_pairs = mt.aggregate_cols(
        hl.agg.collect(hl.tuple([mt.s, sex_ht[mt.col_key].is_female])))

    return dict(sample_sex_pairs)
Esempio n. 6
0
def impute_sex_plot(mt, args, mt_to_annotate=None):
    """
    Impute sex of individuals and plot the resulting F-statistic distribution.

    A 50-bin histogram of the F statistic is saved to
    "<YYYY.MM.DD>_imputed_sex_fstat_hist.html" and the per-class sex counts
    are logged.

    When `mt_to_annotate` is given, both it and `mt` receive the imputed sex
    (`is_female_imputed`; `mt_to_annotate` also gets `f_stat`) plus a
    `sex_imputation_thresholds` global, and `args` is mutated in place:
    `sex_col`, `male_tag` and `female_tag` are set to describe the new column.

    :param mt: maf pruned matrix table used to calculate f stat values
    :param args: namespace providing `female_threshold` and `male_threshold`
    :param mt_to_annotate: optional matrix table to add sex information to
    :return: `(mt, imputed_sex, mt_to_annotate)` if `mt_to_annotate` is not
        None, otherwise `(mt, imputed_sex)` with `mt` unchanged
    """
    datestr = time.strftime("%Y.%m.%d")
    imputed_sex = hl.impute_sex(mt.GT,
                                female_threshold=args.female_threshold,
                                male_threshold=args.male_threshold)

    sex_count = imputed_sex.aggregate(hl.agg.counter(imputed_sex.is_female))
    logging.info(f'Imputed sex count: {sex_count}')

    # Histogram range is taken from the observed min/max F statistic.
    fstat_stats = imputed_sex.aggregate(hl.agg.stats(imputed_sex.f_stat))
    fstat_hist = imputed_sex.aggregate(
        hl.agg.hist(imputed_sex.f_stat, fstat_stats.min, fstat_stats.max, 50))

    output_file(f"{datestr}_imputed_sex_fstat_hist.html")
    save(hl.plot.histogram(fstat_hist,
                           legend='F stat',
                           title='F stat histogram'))

    if mt_to_annotate is None:
        return mt, imputed_sex

    thresholds = {
        'female_threshold': args.female_threshold,
        'male_threshold': args.male_threshold,
    }

    per_sample = imputed_sex[mt_to_annotate.s]
    mt_to_annotate = mt_to_annotate.annotate_cols(
        is_female_imputed=per_sample.is_female,
        f_stat=per_sample.f_stat)
    mt_to_annotate = mt_to_annotate.annotate_globals(
        sex_imputation_thresholds=thresholds)

    mt = mt.annotate_cols(is_female_imputed=imputed_sex[mt.s].is_female)
    mt = mt.annotate_globals(sex_imputation_thresholds=thresholds)

    # Point downstream steps at the imputed column and its boolean encoding.
    args.sex_col = "is_female_imputed"
    args.male_tag = False
    args.female_tag = True

    return mt, imputed_sex, mt_to_annotate
Esempio n. 7
0
def sex_violations(mt, input_type):
    """
    Remove samples whose reported sex disagrees with the imputed sex (step 4).

    The reported sex is read from the column field `is_female` when
    `input_type == "plink"`, and from `annotations.Sex` otherwise. Samples
    with a missing reported sex are never excluded.

    :param mt: Hail MatrixTable with a `GT` entry field and sample ID column `s`
    :param input_type: "plink" to use `mt.is_female`; any other value uses `mt.annotations.Sex`
    :return: tuple `(mt, results)` where `mt` has the discordant samples
        removed and `results` is `{'sex_excluded': <number removed>}`
    """
    imputed_sex = hl.impute_sex(mt.GT)
    imputed_is_female = imputed_sex[mt.s].is_female

    if input_type == "plink":
        # Verify that when sex info is missing value is set to None
        reported = mt.is_female
    else:
        # Verify that when meta file is read in, column formatting is kept
        reported = mt.annotations.Sex

    # `expr is not None` is a Python identity test on the expression object
    # and is always True; hl.is_defined is the per-sample missingness check.
    discordant = hl.is_defined(reported) & (reported != imputed_is_female)
    sex_exclude = mt.filter_cols(discordant).s.collect()

    # hl.literal cannot infer a type from an empty list, so only filter when
    # there is something to remove.
    if sex_exclude:
        mt = mt.filter_cols(hl.literal(sex_exclude).contains(mt['s']),
                            keep=False)

    results = {'sex_excluded': len(sex_exclude)}

    return mt, results
Esempio n. 8
0
def filter_sex_check(mt, fhet_y, fhet_x):
    """
    Remove samples whose imputed F statistic contradicts `is_female` (step 3).

    A sample is removed when it has `is_female == False` and an F statistic
    below `fhet_x`, or `is_female == True` and an F statistic above `fhet_y`.

    :param mt: Hail MatrixTable with `GT` entries and `s` / `is_female` column fields
    :param fhet_y: F-statistic bound applied to samples with `is_female == True`
    :param fhet_x: F-statistic bound applied to samples with `is_female == False`
    :return: tuple `(mt, results)`; `results` holds the number of removed
        samples, the plot returned by `fstat_plt`, and a pandas DataFrame of
        the removed sample IDs (column 'SampleID')
    """
    imputed_sex = hl.impute_sex(mt.GT)
    f_stat = imputed_sex[mt.s].f_stat

    # Explicit comparisons are kept: these are Hail expressions, not Python bools.
    failing_males = (f_stat < fhet_x) & (mt.is_female == False)  # noqa: E712
    failing_females = (f_stat > fhet_y) & (mt.is_female == True)  # noqa: E712
    f_stat_out = mt.filter_cols(failing_males | failing_females).s.collect()

    if f_stat_out:
        removed_ids = hl.literal(f_stat_out)
        mt = mt.filter_cols(removed_ids.contains(mt['s']), keep=False)

    from .test_plots import fstat_plt
    import pandas as pd

    results = {
        'sex_check_removed': len(f_stat_out),
        'sex_check_plot': fstat_plt(imputed_sex, fhet_y, fhet_x),
        'sex_check_table': pd.DataFrame(f_stat_out, columns=['SampleID']),
    }

    return mt, results
Esempio n. 9
0
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    reference_genome: str = "GRCh38",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Sex chromosome ploidies are imputed first (from the VDS, or from a sparse
    MatrixTable via `impute_sex_ploidy`). The variant data is then restricted
    to biallelic SNPs on the X contigs, `hl.impute_sex` is run with
    `f_stat_cutoff` as both the male and female threshold, and the two results
    are combined to infer karyotypes.

    Return Table with the following fields:
        - s (str): Sample
        - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    The ploidy cutoffs and `f_stat_cutoff` are stored as globals on the result.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
        Not supported for a VariantDataset.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param reference_genome: Reference genome used for constructing interval list. Default: 'GRCh38'
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
                    and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
        Defaults to 'AF' (with a warning) when `sites_ht` is given without it.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: if `excluded_intervals` is given with a VDS,
        or if a dense (non-sparse) MatrixTable is passed.
    """
    logger.info("Imputing sex chromosome ploidies...")

    if isinstance(mtds, hl.vds.VariantDataset):
        if excluded_intervals is not None:
            raise NotImplementedError(
                "excluded_intervals is not used when imputing sex chromosome ploidy for VDS"
            )
        vds_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
        )
        # Align the VDS field names with those used further down.
        ploidy_ht = vds_ploidy_ht.rename(
            {"x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy"}
        )
        mt = mtds.variant_data
    elif is_sparse:
        mt = mtds
        ploidy_ht = impute_sex_ploidy(
            mt, excluded_intervals, included_intervals, normalization_contig
        )
    else:
        raise NotImplementedError(
            "Imputing sex ploidy does not exist yet for dense data."
        )

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    if "was_split" in list(mt.row):
        # Split multi-allelics are excluded via the was_split flag.
        is_biallelic = ~mt.was_split
    else:
        is_biallelic = hl.len(mt.alleles) == 2
    mt = mt.filter_rows(is_biallelic & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    x_intervals = [
        hl.parse_locus_interval(contig, reference_genome=reference_genome)
        for contig in x_contigs
    ]
    mt = hl.filter_intervals(mt, x_intervals, keep=True)

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Rows absent from sites_ht get a missing frequency and are dropped.
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff)

    x_cutoff_struct = hl.struct(
        upper_cutoff_X=x_ploidy_cutoffs[0],
        lower_cutoff_XX=x_ploidy_cutoffs[1][0],
        upper_cutoff_XX=x_ploidy_cutoffs[1][1],
        lower_cutoff_XXX=x_ploidy_cutoffs[2],
    )
    y_cutoff_struct = hl.struct(
        lower_cutoff_Y=y_ploidy_cutoffs[0][0],
        upper_cutoff_Y=y_ploidy_cutoffs[0][1],
        lower_cutoff_YY=y_ploidy_cutoffs[1],
    )
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=x_cutoff_struct,
        y_ploidy_cutoffs=y_cutoff_struct,
        f_stat_cutoff=f_stat_cutoff,
    )

    karyotype_fields = get_sex_expr(
        sex_ht.chrX_ploidy, sex_ht.chrY_ploidy, x_ploidy_cutoffs, y_ploidy_cutoffs
    )
    return sex_ht.annotate(**karyotype_fields)
Esempio n. 10
0
    **hl.parse_variant(ht_pruned_chrx_variants.f0, reference_genome='GRCh38'))
# Key the pruned chrX variant table by (locus, alleles) so it can be looked up
# with the MatrixTable row key below.
ht_pruned_chrx_variants = ht_pruned_chrx_variants.key_by(
    ht_pruned_chrx_variants.locus, ht_pruned_chrx_variants.alleles)

# Load the hard calls and restrict to the initial sample set and to the
# pruned chrX variants.
mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_pruned_chrx_variants[mt.row_key]))

# count() returns (n_variants, n_samples).
n = mt.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])

# Impute sex with a single F-statistic cutoff (0.6) used as both the female
# and the male threshold, then attach phenotypes and the full imputation
# struct to the columns.
imputed_sex = hl.impute_sex(mt.GT, female_threshold=0.6, male_threshold=0.6)
mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])
mt = mt.annotate_cols(impute_sex=imputed_sex[mt.s])

# Export a flattened per-sample text file and also persist the column table.
mt.cols().select('impute_sex', 'phenotype').flatten().export(IMPUTESEX_FILE)
# Want to change this to reflect the dataset that I have.
mt.cols().write(IMPUTESEX_TABLE, overwrite=True)

# Determine non-missing allele count on the y.
# Re-read the hard calls (the MT above was filtered to chrX variants), keep
# the same samples, and restrict to chrY loci (non-PAR or PAR).
mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_rows(mt.locus.in_y_nonpar() | mt.locus.in_y_par())
mt = hl.sample_qc(mt, name='qc')

# Export the per-sample number of called genotypes on chrY.
mt_cols = mt.cols()
mt_cols.select(n_called=mt_cols.qc.n_called).export(Y_NCALLED)
Esempio n. 11
0
pprint(a)
# Variants with an alternate allele frequency of at least 1%.
mt_AF = mt.filter_rows(mt.variant_qc.AF[1] >= 0.01)

######## 3. QUALITY CONTROL SAMPLES
######## 3.1 Filter samples for outliers more than (6 * SD) from mean (Part 1)
# Calculate sample statistics
mt = hl.sample_qc(mt)
# Calculate statistics on sample statistics
stats_singleton = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_singleton))
stats_ti_tv = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_ti_tv))
stats_het_hom_var = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_het_hom_var))
stats_het = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_het))

######## 3.2 Sex check on chromosome X (inbreeding coefficient)
# Determine sex from GT calls in sex chromosomes
t = hl.impute_sex(mt.GT)
# Only keep those where genetic sex matches self-reported Sex
mt = mt.filter_cols(t[mt.s].is_female == mt.is_female)

######## 3.3 Check for genetic relationship / "duplicates"
# Calculate identity-by-descent; the result is a Table of sample pairs (i, j).
mt_relatedness = hl.identity_by_descent(mt)
# Keep pairs of samples with PI_HAT above 0.2.
# (The filter previously referenced an undefined name `relatedness`.)
t_ibd = mt_relatedness.filter(mt_relatedness.ibd.PI_HAT > 0.2)
# NOTE: Hail tables are immutable, so the former bare `t_ibd.key_by('i')` and
# `mt.key_cols_by("s")` statements discarded their results and did nothing;
# neither re-keying is needed for the aggregation and filter below.
# Collect the IDs of the related samples in t_ibd.
# NOTE(review): only the `i` member of each pair is collected; a sample that
# appears solely as `j` is not included -- confirm this is intended.
ibd_idx = t_ibd.aggregate(hl.agg.collect_as_set(t_ibd.i))
# Keep only the columns whose sample ID is in the related set. An explicit
# element type is supplied so that an empty set is still a valid literal.
# (`hl.is_defined(ibd_idx)` on a Python set is always true and kept every column.)
mt_ibd = mt.filter_cols(
    hl.literal(ibd_idx, hl.tset(t_ibd.i.dtype)).contains(mt.s))

######### 3.3 Filter samples for outliers more than (6 * SD) from mean (Part 2)
Esempio n. 12
0
#vds5 = hl.read_matrix_table(vds_common_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# sex imputation
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("sex imputation...")
# Drop every row whose locus has an entry in `par`
# (presumably a table of pseudo-autosomal regions -- TODO confirm).
vdsnopar = vds5.filter_rows(hl.is_defined(par[vds5.locus]), keep=False)
# Per sample: number of chrY rows with a defined genotype call.
vdsnopar = vdsnopar.annotate_cols(
    ydp=hl.agg.count_where((vdsnopar.locus.contig == 'chrY')
                           & (hl.is_defined(vdsnopar.GT))))

# Restrict to chrX rows whose variant_qc.AF lies in [0.05, 0.95].
# NOTE(review): AF is compared directly to a float here, which assumes a
# scalar AF field -- verify against the Hail version in use.
vdsx = vdsnopar.filter_rows((vdsnopar.locus.contig == "chrX")
                            & (vdsnopar.variant_qc.AF >= 0.05)
                            & (vdsnopar.variant_qc.AF <= 0.95))
ct = hl.impute_sex(vdsx.GT, female_threshold=0.6, male_threshold=0.7)
# Attach the chrY call count to each sample's imputation result.
vdsct = vdsnopar.cols()
ct = ct.annotate(ydp=vdsct[ct.s].ydp)

# Export sample ID, F statistic, imputed sex and chrY call count.
(ct.select(ID=ct.s, sexFstat=ct.f_stat, isFemale=ct.is_female,
           ydp=ct.ydp).export(sample_sex_fstat_file))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ld pruning
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("LD pruning...")
# Prune vds5 with an r2 threshold of 0.1, requesting 1600 cores.
vds5_ldp = hl.ld_prune(vds5, n_cores=1600, r2=0.1)
#vds5_ldp = hl.ld_prune(vds5, n_cores=60, r2=0.2, window=1000000, memory_per_core=512)

print("writing LD pruned VDS...")
Esempio n. 13
0
def checkSex(mt):
    """
    Impute sex per sample and flag disagreements with the reported sex.

    Adds the hl.impute_sex result struct as column field "imputedSex" and a
    boolean column "sex_filter" that is True when the imputed `is_female`
    differs from `reported_sex == 'F'`.

    :param mt: Hail MatrixTable with `GT` entries and `s` / `reported_sex` column fields
    :return: MatrixTable with "imputedSex" and "sex_filter" column fields added
    """
    imputed_ht = hl.impute_sex(mt.GT)
    annotated = mt.annotate_cols(imputedSex=imputed_ht[mt.s])

    reported_female = annotated.reported_sex == 'F'
    mismatch = annotated.imputedSex.is_female != reported_female
    return annotated.annotate_cols(sex_filter=mismatch)
Esempio n. 14
0
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
    variants_only_x_ploidy: bool = False,
    variants_only_y_ploidy: bool = False,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`.
          Named var_data_`normalization_contig`_mean_dp when it was computed from variant data only.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    The ploidy cutoffs, `f_stat_cutoff` and both `variants_only_*` flags are
    stored as globals on the returned Table.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
        Not supported for a VariantDataset.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
                    and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
        Defaults to 'AF' (with a warning) when `sites_ht` is given without it.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
    :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: if `excluded_intervals` is given with a VDS,
        or if a dense (non-sparse) MatrixTable is passed.
    """
    logger.info("Imputing sex chromosome ploidies...")

    # Name of the normalization-contig mean DP field for the X-ploidy run.
    x_norm_dp_field = (
        f"var_data_{normalization_contig}_mean_dp"
        if variants_only_x_ploidy
        else f"{normalization_contig}_mean_dp"
    )

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
            )
        # Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
            use_variant_dataset=variants_only_x_ploidy,
        )
        ploidy_ht = ploidy_ht.rename({
            "x_ploidy": "chrX_ploidy",
            "y_ploidy": "chrY_ploidy",
            "x_mean_dp": "chrX_mean_dp",
            "y_mean_dp": "chrY_mean_dp",
            "autosomal_mean_dp": x_norm_dp_field,
        })
        # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation using
        # the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
        if variants_only_y_ploidy != variants_only_x_ploidy:
            y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
                mtds,
                calling_intervals=included_intervals,
                normalization_contig=normalization_contig,
                use_variant_dataset=variants_only_y_ploidy,
            )
            y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
            ploidy_ht = ploidy_ht.annotate(
                chrY_ploidy=y_ploidy_idx.y_ploidy,
                chrY_mean_dp=y_ploidy_idx.y_mean_dp,
            )

            # If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
            # that this is the variant dataset only mean DP (this will have already been added if
            # 'variants_only_x_ploidy' was also True).
            if variants_only_y_ploidy:
                ploidy_ht = ploidy_ht.annotate(
                    **{
                        f"var_data_{normalization_contig}_mean_dp":
                        y_ploidy_idx.autosomal_mean_dp
                    })

        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt,
                excluded_intervals,
                included_intervals,
                normalization_contig,
                use_only_variants=variants_only_x_ploidy,
            )
            # NOTE(review): this rename expects an "autosomal_mean_dp" field on
            # the impute_sex_ploidy result, while the Y re-run below reads
            # f"{normalization_contig}_mean_dp" from the same helper -- confirm
            # which field name impute_sex_ploidy actually returns.
            ploidy_ht = ploidy_ht.rename({
                "autosomal_mean_dp": x_norm_dp_field,
            })
            # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
            # using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
            if variants_only_y_ploidy != variants_only_x_ploidy:
                y_ploidy_ht = impute_sex_ploidy(
                    mt,
                    excluded_intervals,
                    included_intervals,
                    normalization_contig,
                    use_only_variants=variants_only_y_ploidy,
                )
                # Tables are immutable: the selection must be re-assigned.
                # Otherwise the annotate below would also copy the X fields of
                # the Y run and overwrite the X-ploidy estimates made with
                # 'variants_only_x_ploidy'.
                y_ploidy_ht = y_ploidy_ht.select(
                    "chrY_ploidy",
                    "chrY_mean_dp",
                    f"{normalization_contig}_mean_dp",
                )
                # If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
                # that this is the variant dataset only mean DP. The rename is applied to the Y-run table, whose
                # mean DP is the variant-only one, not to the X-run table.
                if variants_only_y_ploidy:
                    y_ploidy_ht = y_ploidy_ht.rename({
                        f"{normalization_contig}_mean_dp":
                        f"var_data_{normalization_contig}_mean_dp"
                    })
                # Re-annotate the ploidy HT with modified Y ploidy annotations
                ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])

        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data.")

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    # Build the X intervals on the same reference genome as the data.
    build = get_reference_genome(mt.locus).name
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=build)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Rows absent from sites_ht get a missing frequency and are dropped.
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
        variants_only_x_ploidy=variants_only_x_ploidy,
        variants_only_y_ploidy=variants_only_y_ploidy,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                       x_ploidy_cutoffs, y_ploidy_cutoffs))
Esempio n. 15
0
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_checkpoint.mt",  overwrite=True)
    print("Finished splitting and writing mt. ")
    mt = mt_split.annotate_rows(
        Variant_Type=hl.cond((hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
                             hl.cond(
            hl.is_insertion(
                mt_split.alleles[0], mt_split.alleles[1]),
            "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0],
                                   mt_split.alleles[1]), "INDEL",
                    "Other"))))
    mt_sampleqc = hl.sample_qc(mt, name='sample_QC_Hail')
    panda_df_unfiltered_table = mt_sampleqc.cols().flatten()
    print("Sex imputation:")
    #mt2_sex = mt2.select_entries(GT=hl.unphased_diploid_gt_index_call(mt2.GT.n_alt_alleles()))
    imputed_sex = hl.impute_sex(mt_sampleqc.GT)

    # Annotate samples male or female:
    mt = mt_sampleqc.annotate_cols(sex=hl.cond(
        imputed_sex[mt_sampleqc.s].is_female, "female", "male"))

    print("Outputting table of sample qc")
    panda_df_unfiltered_table.export(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_sampleQC_unfiltered_sex_annotated.tsv.bgz", header=True)

   # mt2 = hl.variant_qc(mt_sampleqc, name='variant_QC_Hail')

    #print('Exporting variant qc pandas table to disk')
   # mt_rows = mt2.rows()
   # mt_rows.select(mt_rows.variant_QC_Hail).flatten().export(f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_variantQC_unfiltered.tsv.bgz",
    #          header=True)