Exemple #1
0
def test_GetHeterozygosity():
    """Check utils.GetHeterozygosity on monomorphic, bi-/tri-allelic and empty inputs."""
    # (allele frequency dict, expected heterozygosity); the expected values
    # are consistent with 1 - sum(freq ** 2).
    cases = [
        ({0: 1}, 0),
        ({0: 0.5, 1: 0.5}, 0.5),
        ({0: 0.5, 1: 0.2, 2: 0.3}, 0.62),
    ]
    for afreqs, expected in cases:
        # Compare with a tolerance: the result is computed from floats, so
        # exact equality (e.g. against 0.62) depends on rounding details.
        assert np.isclose(utils.GetHeterozygosity(afreqs), expected)

    # An empty frequency dictionary is expected to yield NaN.
    assert np.isnan(utils.GetHeterozygosity({}))
Exemple #2
0
 def __call__(self, record):
     """Return the record's heterozygosity when it exceeds ``self.threshold``.

     The record is harmonized using ``self.vcftype`` and allele frequencies
     are computed with ``uselength=self.uselength``. Returns ``None`` when
     the heterozygosity is not strictly greater than the threshold.
     """
     harmonized = trh.HarmonizeRecord(self.vcftype, record)
     freqs = harmonized.GetAlleleFreqs(uselength=self.uselength)
     heterozygosity = utils.GetHeterozygosity(freqs)
     return heterozygosity if heterozygosity > self.threshold else None
Exemple #3
0
def GetHet(trrecord, samplelists=None, uselength=True):
    r"""Compute heterozygosity of a locus

    Heterozygosity is defined as the probability
    that two randomly drawn alleles are different.

    Parameters
    ----------
    trrecord: trh.TRRecord object
          The record that we are computing the statistic for
    samplelists: list of list of str, optional
          List of lists of the samples that we include when computing the
          statistic. If None or empty, a single value is computed with
          ``samplelist=None`` passed to ``trrecord.GetAlleleFreqs``.
          The argument is never modified.
    uselength: bool
          Whether we should collapse alleles by length

    Returns
    -------
    heterozygosity: list of float
          The heterozygosity of the locus. One value for each sample list.
          If the allele frequencies dictionary is invalid, return np.nan
    """
    # Use a local list instead of appending to the argument: appending would
    # mutate the caller's list (and, with a list default, the shared default).
    if samplelists is None or len(samplelists) == 0:
        groups = [None]
    else:
        groups = samplelists
    return [
        utils.GetHeterozygosity(
            trrecord.GetAlleleFreqs(samplelist=sl, uselength=uselength))
        for sl in groups
    ]
Exemple #4
0
def main(args):
    """Filter a TR VCF at the call and locus level and write the results.

    Reads ``args.vcf``, applies call-level and locus-level filters, recomputes
    the AC/REFAC/HET/HWEP/HRUN INFO fields for every written record, and
    writes ``args.out + ".vcf"``, ``args.out + ".samplog.tab"`` and
    ``args.out + ".loclog.tab"``.

    Parameters
    ----------
    args : namespace-like object
        Parsed command line options. Attributes read directly here: ``vcf``,
        ``out``, ``vcftype``, ``die_on_warning``, ``verbose``,
        ``num_records``, ``drop_filtered`` and ``use_length``. The object is
        also passed on to the filter-building helpers.

    Returns
    -------
    int
        0 on success, 1 if any set-up step fails or a record cannot be parsed
        while ``args.die_on_warning`` is set.
    """
    # Load VCF file
    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf, args.vcftype)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1

    # Set up locus-level filter list (built exactly once; a ValueError from
    # the builder is treated as a set-up failure)
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        # Header description = first line of the filter's docstring, if any
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields: (ID, Number, Type, Description)
    new_infos = (
        ("AC", -1, "Integer", "Alternate allele counts"),
        ("REFAC", 1, "Integer", "Reference allele count"),
        ("HET", 1, "Float", "Heterozygosity"),
        ("HWEP", 1, "Float", "HWE p-value for obs. vs. exp het rate"),
        ("HRUN", 1, "Integer", "Length of longest homopolymer run"),
    )
    for info_id, number, info_type, description in new_infos:
        invcf.infos[info_id] = _Info(info_id,
                                     number,
                                     info_type,
                                     description,
                                     source=None,
                                     version=None)

    # Set up output files (the output directory was already validated above)
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1

    # Set up sample info: per-sample counters, one extra per filter reason
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning:
                return 1
            # Actually skip: without this the loop body would run on the
            # previous record again (or on an unbound name on the first one).
            continue
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break
        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters: a filter returning None means "not triggered"
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            if filt(record) is None:
                continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered and record.call_rate == 0:
            output_record = False
        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                # Two alleles per called sample, the same factor REFAC uses,
                # so that AC and REFAC together add up to the allele total.
                num_alleles = 2 * record.num_called
                record.INFO["AC"] = [
                    int(item * num_alleles) for item in record.aaf
                ]
                record.INFO["REFAC"] = int(
                    (1 - sum(record.aaf)) * num_alleles)
            else:
                # No called samples: write placeholder values
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")

    return 0
Exemple #5
0
def load_strs(imputation_run_name: str,
              region: str,
              samples: np.ndarray,
              details: bool = True,
              var_subset: Optional[Set[int]] = None,
              hardcalls=False):
    """
    Iterate over a region returning genotypes at STR loci.

    This is a generator. If ``details`` is True, the first yield is a tuple
    of names of the fields in locus_details. Every subsequent yield is a
    6-tuple ``(dosages, unique_alleles, chrom, pos, locus_filtered,
    locus_details)`` described in the yields section below.

    Parameters
    ----------
    imputation_run_name:
        which imputation run to load genotypes from? Used to build the path
        ``{ukb}/str_imputed/runs/<run>/vcfs/annotated_strs/chr<chrom>.vcf.gz``
    region:
        chr:start-end
    samples:
        A boolean array of length nsamples determining which samples are included
        (True) and which are not. ``slice(None)`` is also accepted and selects
        all samples.
    details:
        If True, yield the field-name header first and compute
        locus_details for every locus; otherwise locus_details is None.
    var_subset:
        If not None, only records whose POS is in this set are returned.
    hardcalls:
        If True, yield hardcall length genotypes instead of dosages. Requires
        ``var_subset`` to be given and ``details`` to be False (checked by an
        assert when the generator is first advanced).

    Yields
    ------
    dosages: Dict[float, np.ndarray]
        A dictionary from unique length alleles to 2D arrays of size (n_samples, 2)
        which contain the dosages of those alleles for each haplotype
        Length dosage are measured in number of repeats.

        None if locus_filtered is not None

        If hardcalls, then instead of that just an array nx2 of length alleles
    unique_alleles: np.ndarray
        Array of unique length alleles (measured in number of repeats),
        same length as the dosages dict
    chrom: str
        e.g. '13'
    pos: int
    locus_filtered:
        None if the locus is not filtered, otherwise
        a string explaining why.
        'MAC<20' if the minor allele dosage is less than 20
        after sample subsetting, per plink's standard.

        None if hardcalls.
    locus_details:
        tuple of strings with the same length as the first yield
        with the corresponding order.

        None if hardcalls or if details is False.

    Notes
    -----
    Hardcalls mentioned in the locus details are phased hardcalls, and in some
    corner cases will not correspond to the maximum likelihood unphased allele.

    Records that start before the region start, or that have no PERIOD INFO
    field, are skipped.
    """

    if hardcalls:
        assert var_subset is not None and not details

    chrom, region_poses = region.split(':')
    # only the region start is needed here; the end is handled by the
    # region query on the VCF below
    region_start, _ = [int(pos) for pos in region_poses.split('-')]
    # `ukb` is a module-level name defined outside this block
    vcf_fname = (f'{ukb}/str_imputed/runs/{imputation_run_name}/'
                 f'vcfs/annotated_strs/chr{chrom}.vcf.gz')
    vcf = cyvcf2.VCF(vcf_fname)

    if details:
        # header: names of the entries of each locus_details tuple, in order
        yield ('motif', 'period', 'ref_len', 'total_per_allele_dosages',
               'total_hardcall_alleles', 'total_hardcall_genotypes',
               'subset_total_per_allele_dosages',
               'subset_total_hardcall_alleles',
               'subset_total_hardcall_genotypes', 'subset_het',
               'subset_entropy', 'subset_HWEP', 'subset_allele_dosage_r2')
    for record in vcf(region):
        if record.POS < region_start:
            # records that overlap this region but started before this region
            # should be considered part of the previous region and not returned here
            continue
        if record.INFO.get('PERIOD') is None:
            # there are a few duplicate loci which I didn't handle
            # properly, this identifies and removes them
            continue

        if var_subset is not None and record.POS not in var_subset:
            continue

        trrecord = trh.HarmonizeRecord(vcfrecord=record,
                                       vcftype='beagle-hipstr')

        # allele lengths with the reference first, followed by the alternates,
        # rounded to `allele_len_precision` (module-level, not visible here)
        len_alleles = [trrecord.ref_allele_length
                       ] + trrecord.alt_allele_lengths
        len_alleles = [
            round(allele_len, allele_len_precision)
            for allele_len in len_alleles
        ]

        if hardcalls:
            # the last genotype column is dropped
            # NOTE(review): presumably that column is the phasing flag - confirm
            yield (trrecord.GetLengthGenotypes()[samples, :-1],
                   np.unique(len_alleles), trrecord.chrom, trrecord.pos, None,
                   None)
            continue

        if details:
            # total dosage per unique length over ALL samples (no subsetting);
            # alleles that round to the same length accumulate together
            total_dosages = {_len: 0 for _len in np.unique(len_alleles)}
            for p in (1, 2):
                # NOTE(review): AP1/AP2 look like per-haplotype alternate
                # allele probabilities (one column per alt allele) - confirm
                ap = trrecord.format[f'AP{p}']
                # reference dosage = 1 - sum of alt columns, clamped at 0
                total_dosages[len_alleles[0]] += np.sum(
                    np.maximum(0, 1 - np.sum(ap, axis=1)))
                for i in range(ap.shape[1]):
                    total_dosages[len_alleles[i + 1]] += np.sum(ap[:, i])

            total_hardcall_alleles = clean_len_alleles(
                trrecord.GetAlleleCounts())
            total_hardcall_genotypes = clean_len_allele_pairs(
                trrecord.GetGenotypeCounts())

        if isinstance(samples, slice):
            # the only slice supported is "all samples"
            assert samples == slice(None)
            n_subset_samples = trrecord.GetNumSamples()
        else:
            n_subset_samples = int(np.sum(samples))

        # per-sample dosages for the selected samples: one (n, 2) array per
        # unique length, column p-1 holding the values from AP{p}
        subset_dosage_gts = {
            _len: np.zeros((n_subset_samples, 2))
            for _len in np.unique(len_alleles)
        }

        for p in (1, 2):
            # todo genotype dosages
            ap = trrecord.format[f'AP{p}']
            # reference dosage = 1 - sum of alt columns, clamped at 0
            subset_dosage_gts[len_alleles[0]][:, (p-1)] += \
                    np.maximum(0, 1 - np.sum(ap[samples, :], axis=1))
            for i in range(ap.shape[1]):
                subset_dosage_gts[len_alleles[i + 1]][:,
                                                      (p - 1)] += ap[samples,
                                                                     i]

        subset_total_dosages = {
            _len: np.sum(subset_dosage_gts[_len])
            for _len in subset_dosage_gts
        }

        if details:
            subset_total_hardcall_alleles = clean_len_alleles(
                trrecord.GetAlleleCounts(samples))
            subset_total_hardcall_genotypes = clean_len_allele_pairs(
                trrecord.GetGenotypeCounts(samples))
            subset_hardcall_allele_freqs = clean_len_alleles(
                trrecord.GetAlleleFreqs(samples))

            subset_het = utils.GetHeterozygosity(subset_hardcall_allele_freqs)
            subset_entropy = utils.GetEntropy(subset_hardcall_allele_freqs)
            subset_hwep = utils.GetHardyWeinbergBinomialTest(
                subset_hardcall_allele_freqs, subset_total_hardcall_genotypes)

            # https://www.cell.com/ajhg/fulltext/S0002-9297(09)00012-3#app1
            # Browning, Brian L., and Sharon R. Browning. "A unified approach to genotype imputation and haplotype-phase inference for large data sets of trios and unrelated individuals." The American Journal of Human Genetics 84.2 (2009): 210-223.
            # appendix 1
            subset_allele_dosage_r2 = {}

            subset_hardcalls = np.around(
                trrecord.GetLengthGenotypes()[samples, :-1],
                allele_len_precision)
            for length in len_alleles:
                # calculate allele dosage r**2 for this length
                if length in subset_allele_dosage_r2:
                    # several alleles can share a rounded length; do it once
                    continue

                # boolean indicator of which hardcalls equal this length
                calls = subset_hardcalls == length

                # squared correlation between the flattened hardcall
                # indicators and the flattened dosages
                subset_allele_dosage_r2[length] = np.corrcoef(
                    calls.reshape(-1),
                    subset_dosage_gts[length].reshape(-1))[0, 1]**2

            # same order as the header tuple yielded before the loop
            locus_details = (trrecord.motif, str(len(trrecord.motif)),
                             str(
                                 round(trrecord.ref_allele_length,
                                       allele_len_precision)),
                             dict_str(
                                 round_vals(total_dosages, dosage_precision)),
                             dict_str(total_hardcall_alleles),
                             dict_str(total_hardcall_genotypes),
                             dict_str(
                                 round_vals(subset_total_dosages,
                                            dosage_precision)),
                             dict_str(subset_total_hardcall_alleles),
                             dict_str(subset_total_hardcall_genotypes),
                             str(subset_het), str(subset_entropy),
                             str(subset_hwep),
                             dict_str(
                                 round_vals(subset_allele_dosage_r2,
                                            r2_precision)))
        else:
            locus_details = None

        # minor allele dosage: total dosage of every allele except the one
        # with the largest total
        mac = list(subset_total_dosages.values())
        mac.pop(np.argmax(mac))

        if np.sum(mac) < 20:
            # filtered locus: no dosages are returned
            yield (None, np.unique(len_alleles), trrecord.chrom, trrecord.pos,
                   'MAC<20', locus_details)
            continue

        yield (subset_dosage_gts, np.unique(len_alleles), trrecord.chrom,
               trrecord.pos, None, locus_details)