Ejemplo n.º 1
0
def test_string_or_vcftype(vcfdir):
    """The string "gangstr" and ``trh.VcfTypes.gangstr`` must be interchangeable."""
    enum_type = trh.VcfTypes.gangstr

    # Each capability query has to answer identically for both spellings.
    capability_queries = (
        trh.HasLengthAltGenotypes,
        trh.HasLengthRefGenotype,
        trh.MayHaveImpureRepeats,
    )
    for query in capability_queries:
        assert query("gangstr") == query(enum_type)

    reset_vcfs(vcfdir)

    # First record, harmonized via the string spelling.
    first_expected = len("tctgtctgtctg") / len("tctg")
    first_record = trh.HarmonizeRecord("gangstr", next(gangstr_vcf))
    assert first_record.GetMaxAllele() == first_expected

    # Second record, harmonized via the enum spelling.
    second_expected = len("aaaacaaaacaaaacaaaac") / len("aaaac")
    second_record = trh.HarmonizeRecord(enum_type, next(gangstr_vcf))
    assert second_record.GetMaxAllele() == second_expected
Ejemplo n.º 2
0
def test_wrong_vcftype(vcfdir):
    """Declaring one VCF type while supplying a different one must raise TypeError."""

    def wrong_readers(declared):
        # Lazily pair every type that differs from `declared` with a reader
        # for it. Per the original note, all_types() iterates over both the
        # TR caller types and the error file types.
        for candidate in all_types():
            if candidate != declared:
                yield candidate, get_vcf(candidate)

    for correct_type in trh.VcfTypes:
        # Pass 1: constructing a harmonizer over a mismatched reader.
        reset_vcfs(vcfdir)
        for incorrect_type, invcf in wrong_readers(correct_type):
            with pytest.raises(TypeError):
                print(correct_type, incorrect_type)
                trh.TRRecordHarmonizer(invcf, vcftype=correct_type)

        # Pass 2: harmonizing a single record pulled from a mismatched reader.
        reset_vcfs(vcfdir)
        for incorrect_type, invcf in wrong_readers(correct_type):
            record = next(invcf)
            with pytest.raises(TypeError):
                print(correct_type, incorrect_type)
                trh.HarmonizeRecord(correct_type, record)
Ejemplo n.º 3
0
 def __call__(self, record):
     """Return the record's heterozygosity if it exceeds ``self.threshold``, else None."""
     freqs = trh.HarmonizeRecord(self.vcftype, record).GetAlleleFreqs(
         uselength=self.uselength)
     heterozygosity = utils.GetHeterozygosity(freqs)
     # A non-None return value carries the offending statistic itself.
     return heterozygosity if heterozygosity > self.threshold else None
Ejemplo n.º 4
0
 def __call__(self, record):
     """Return the record's HWE p-value if it is below ``self.threshold``, else None."""
     harmonized = trh.HarmonizeRecord(self.vcftype, record)
     p_value = utils.GetHardyWeinbergBinomialTest(
         harmonized.GetAlleleFreqs(uselength=self.uselength),
         harmonized.GetGenotypeCounts(uselength=self.uselength))
     # A non-None return value carries the offending p-value itself.
     return p_value if p_value < self.threshold else None
Ejemplo n.º 5
0
def main_helper(output, str_imputation_run_name, chrom, all_white_brits_fname):
    """
    Write per-locus allele-distribution statistics for one chromosome.

    For every record with a PERIOD INFO field, the AP1/AP2 allele
    probabilities of the selected samples are summed per length allele,
    normalized by the number of selected haplotypes, and summarized.

    Parameters
    ----------
    output:
        Writable text stream; receives a header line followed by one
        tab-separated line per locus.
    str_imputation_run_name:
        Name of the imputation run, used to build the VCF path.
    chrom:
        Chromosome identifier, used to build the VCF path.
    all_white_brits_fname:
        Path to a file listing one sample ID per line; its first line is
        skipped.
    """
    vcf_fname = (f'{ukb}/str_imputed/runs/{str_imputation_run_name}/'
                 f'vcfs/annotated_strs/chr{chrom}.vcf.gz')
    vcf = cyvcf2.VCF(vcf_fname)

    with open(all_white_brits_fname) as samp_file:
        next(samp_file)  # skip the first line of the sample file
        subset_samples = [line.strip() for line in samp_file]

    # VCF sample names are split on '_' and only the first piece is compared
    # against the IDs from the sample file.
    all_samples = [l[0] for l in np.char.split(vcf.samples, '_')]
    samples = np.isin(all_samples, subset_samples)
    # Two haplotypes per selected sample; invariant across records.
    n_haplotypes = np.sum(samples) * 2

    output.write(
        'chr\tpos\tallele_dist\tentropy\theterozygosity\tmultiallelicness\n')
    for record in vcf:
        if record.INFO.get('PERIOD') is None:
            continue
        trrecord = trh.HarmonizeRecord(vcfrecord=record,
                                       vcftype='beagle-hipstr')

        len_alleles = [trrecord.ref_allele_length
                       ] + trrecord.alt_allele_lengths
        total_subset_dosages = {len_: 0 for len_ in np.unique(len_alleles)}
        for p in (1, 2):
            ap = trrecord.format[f'AP{p}']
            # AP holds probabilities for the alternate alleles only; the
            # reference dosage is the (non-negative) remainder.
            total_subset_dosages[len_alleles[0]] += \
                    np.sum(np.maximum(0, 1 - np.sum(ap[samples, :], axis=1)))
            for i in range(ap.shape[1]):
                total_subset_dosages[len_alleles[i + 1]] += np.sum(ap[samples,
                                                                      i])

        for len_ in total_subset_dosages:
            total_subset_dosages[len_] /= n_haplotypes

        frequencies = list(total_subset_dosages.values())
        entropy = scipy.stats.entropy(frequencies, base=2)
        # Builtin sum: passing a generator to np.sum is deprecated in NumPy.
        heterozygosity = 1 - sum(val**2 for val in frequencies)
        # Total frequency of everything except the two most common alleles.
        multiallelicness = sum(sorted(frequencies)[:-2])

        output.write(trrecord.chrom + '\t' + str(trrecord.pos) + '\t' +
                     str(total_subset_dosages) + '\t' + str(entropy) + '\t' +
                     str(heterozygosity) + '\t' + str(multiallelicness) + '\n')
Ejemplo n.º 6
0
def test_HarmonizeRecord(vcfdir):
    """Check harmonized alleles, motifs and flags for each supported caller's VCF.

    The module-level readers (``snps_vcf``, ``gangstr_vcf``, ``hipstr_vcf``,
    ``popstr_vcf``, ``advntr_vcf``, ``eh_vcf``) are consumed in order, so the
    expected values below are tied to the position of each record in its file.
    """
    reset_vcfs(vcfdir)

    # Unknown type
    with pytest.raises(ValueError):
        trh.HarmonizeRecord("foo", next(snps_vcf))

    # Gangstr
    gangstr_trh = trh.TRRecordHarmonizer(gangstr_vcf)

    tr_rec1 = next(iter(gangstr_trh))
    assert tr_rec1.ref_allele == 'tctgtctgtctg'.upper()
    assert tr_rec1.alt_alleles == []
    assert tr_rec1.motif == 'tctg'.upper()
    assert not tr_rec1.HasFullStringGenotypes()
    assert not tr_rec1.HasFabricatedRefAllele()
    assert not tr_rec1.HasFabricatedAltAlleles()
    # The second record is read only to advance to the third.
    tr_rec2 = next(iter(gangstr_trh))
    tr_rec3 = next(iter(gangstr_trh))
    assert (tr_rec3.ref_allele ==
            'tgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtg'.upper())
    assert (tr_rec3.alt_alleles == [
        'tgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtg'.upper()
    ])
    assert tr_rec3.motif == 'tg'.upper()

    # hipstr
    hipstr_trh = trh.TRRecordHarmonizer(hipstr_vcf)

    str_iter = iter(hipstr_trh)
    tr_rec1 = next(str_iter)
    assert tr_rec1.ref_allele == 'GGTGGTGGTGGGGGCGGTGGGGGTGGTG'
    assert tr_rec1.alt_alleles == ['GGTGGTGGTGGGGGCGGTGGTGGTGCTG']
    assert tr_rec1.motif == 'GGT'
    assert tr_rec1.record_id == 'STR_2'
    assert not tr_rec1.HasFullStringGenotypes()
    assert not tr_rec1.HasFabricatedRefAllele()
    assert not tr_rec1.HasFabricatedAltAlleles()
    # The second record is read only to advance to the third.
    tr_rec2 = next(str_iter)
    tr_rec3 = next(str_iter)
    assert tr_rec3.ref_allele == 'TTTTTTTTTTTTTTT'
    assert tr_rec3.alt_alleles == []
    assert tr_rec3.motif == 'T'.upper()
    assert tr_rec3.record_id == 'STR_4'
    # Scan forward to the record that carries full string genotypes.
    record = next(str_iter)
    while record.record_id != "STR_125":
        record = next(str_iter)
    assert record.HasFullStringGenotypes()
    # TODO this isn't really the correct behavior -
    # we're trimming off an extra repeat from the alt allele
    assert record.full_alleles == (
        "TGCATATATGTATAATATATATTATATATGGA",
        ["TCCATATATGCATAATATATATTATATATATG"],
    )
    assert (record.ref_allele == "ATATATGTATAATATATATTATATAT")
    assert (record.alt_alleles == ["ATATATGCATAATATATATTATATAT"])

    # popstr
    popstr_trh = trh.TRRecordHarmonizer(popstr_vcf)

    tr_rec1 = next(iter(popstr_trh))
    assert tr_rec1.ref_allele == 'GGGGGGGCGGGGGGGGGG'
    assert tr_rec1.alt_alleles == ['G' * 14, 'G' * 17]
    assert tr_rec1.motif == 'G'
    assert tr_rec1.record_id == 'chr21:5020351:M'
    assert not tr_rec1.HasFullStringGenotypes()
    assert not tr_rec1.HasFabricatedRefAllele()
    assert tr_rec1.HasFabricatedAltAlleles()
    # The second record is read only to advance to the third.
    tr_rec2 = next(iter(popstr_trh))
    tr_rec3 = next(iter(popstr_trh))
    assert tr_rec3.ref_allele == 'TTTTTTTTTTTTTTTTTTTTTT'
    assert tr_rec3.alt_alleles == ['T' * 21]
    assert tr_rec3.motif == 'T'
    assert tr_rec3.record_id == 'chr21:5031126:M'

    # advntr
    advntr_trh = trh.TRRecordHarmonizer(advntr_vcf)

    tr_rec1 = next(iter(advntr_trh))
    assert tr_rec1.ref_allele == 'GCGCGGGGCGGGGCGCGGGGCGGGGCGCGGGGCGGG'
    assert tr_rec1.alt_alleles == [
        'GCGCGGGGCGGGGCGCGGGGCGGG',
        'GCGCGGGGCGGGGCGCGGGGCGGGGCGCGGGGCGGGGCGCGGGGCGGGGCGCGGGGCGGG'
    ]
    assert tr_rec1.motif == 'GCGCGGGGCGGG'
    assert not tr_rec1.HasFullStringGenotypes()
    assert not tr_rec1.HasFabricatedRefAllele()
    assert not tr_rec1.HasFabricatedAltAlleles()

    # eh (presumably ExpansionHunter - confirm against the fixture's origin)
    eh_trh = trh.TRRecordHarmonizer(eh_vcf)

    tr_rec1 = next(iter(eh_trh))
    motif = 'CAG'
    assert tr_rec1.ref_allele == motif * 19
    assert tr_rec1.alt_alleles == [motif * 16, motif * 18]
    assert tr_rec1.motif == motif
    assert tr_rec1.record_id == 'HTT'
    assert not tr_rec1.HasFullStringGenotypes()
    assert tr_rec1.HasFabricatedRefAllele()
    assert tr_rec1.HasFabricatedAltAlleles()
Ejemplo n.º 7
0
def main(args):
    """
    Compare TR calls between two VCF files and write concordance reports.

    Records of the two readers are walked in sorted order; whenever both
    readers sit on the same CHROM/POS the harmonized records are compared
    for every shared sample. The per-call table is written to
    ``<out>-callcompare.tab`` and then summarized overall, per locus and per
    sample.

    Parameters
    ----------
    args :
        Parsed command-line arguments. Reads ``out``, ``vcf1``, ``vcf2``,
        ``region``, ``samples``, ``stratify_file``, ``stratify_fields``,
        ``stratify_binsizes``, ``vcftype1``, ``vcftype2``, ``numrecords``,
        ``verbose``, ``period``, ``noplot``, ``bubble_min`` and
        ``bubble_max``.

    Returns
    -------
    int
        0 on success, 1 when an input or argument check fails.
    """
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    ### Check and load VCF files ###
    vcfreaders = utils.LoadReaders([args.vcf1, args.vcf2],
                                   checkgz=True,
                                   region=args.region)
    if vcfreaders is None or len(vcfreaders) != 2:
        return 1
    contigs = vcfreaders[0].contigs
    chroms = list(contigs)

    ### Load shared samples ###
    samples = mergeutils.GetSharedSamples(vcfreaders)
    if len(samples) == 0:
        common.WARNING("No shared samples found between vcf readers")
        return 1
    if args.samples:
        # Restrict to the requested samples; close the list file promptly.
        with open(args.samples, "r") as samples_file:
            usesamples = {line.strip() for line in samples_file}
        samples = list(set(samples).intersection(usesamples))
    if len(samples) == 0:
        common.WARNING("No shared samples found between files")
        return 1

    ### Determine FORMAT fields we should look for ###
    if args.stratify_file is not None and args.stratify_file not in [0, 1, 2]:
        common.MSG("--stratify-file must be 0,1, or 2")
        return 1
    format_fields, format_binsizes = GetFormatFields(args.stratify_fields,
                                                     args.stratify_binsizes,
                                                     args.stratify_file,
                                                     vcfreaders)

    ### Keep track of data to summarize at the end ###
    results_dir = {
        "chrom": [],
        "start": [],
        "period": [],
        "sample": [],
        "gtstring1": [],
        "gtstring2": [],
        "gtsum1": [],
        "gtsum2": [],
        "metric-conc-seq": [],
        "metric-conc-len": [],
    }
    for ff in format_fields:
        results_dir[ff + "1"] = []
        results_dir[ff + "2"] = []

    vcftype1 = trh.GetVCFType(vcfreaders[0], args.vcftype1)
    vcftype2 = trh.GetVCFType(vcfreaders[1], args.vcftype2)

    ### Walk through sorted readers, merging records as we go ###
    current_records = [next(reader) for reader in vcfreaders]
    is_min = mergeutils.GetMinRecords(current_records, chroms)

    done = mergeutils.DoneReading(current_records)
    num_records = 0
    while not done:
        if any(item is None for item in current_records):
            break
        if args.numrecords is not None and num_records >= args.numrecords:
            break
        if args.verbose:
            mergeutils.DebugPrintRecordLocations(current_records, is_min)
        if mergeutils.CheckMin(is_min):
            return 1
        # Compare only when every reader is at the minimum position. The
        # previous `all([is_min])` wrapped the list in another list and was
        # therefore true for any non-empty is_min.
        if all(is_min):
            first, second = current_records[0], current_records[1]
            if first.CHROM == second.CHROM and first.POS == second.POS:
                UpdateComparisonResults(
                    trh.HarmonizeRecord(vcftype1, first),
                    trh.HarmonizeRecord(vcftype2, second),
                    format_fields, samples, results_dir)
        current_records = mergeutils.GetNextRecords(vcfreaders,
                                                    current_records, is_min)
        is_min = mergeutils.GetMinRecords(current_records, chroms)
        done = mergeutils.DoneReading(current_records)
        num_records += 1

    ### Load all results to a dataframe and output full results ###
    data = pd.DataFrame(results_dir)
    data.to_csv(args.out + "-callcompare.tab", sep="\t", index=False)

    ### Overall metrics ###
    OutputOverallMetrics(data, format_fields, format_binsizes,
                         args.stratify_file, args.period, args.out)
    if not args.noplot:
        OutputBubblePlot(data,
                         args.period,
                         args.out,
                         minval=args.bubble_min,
                         maxval=args.bubble_max)

    ### Per-locus metrics ###
    OutputLocusMetrics(data, args.out, args.noplot)

    ### Per-sample metrics ###
    OutputSampleMetrics(data, args.out, args.noplot)

    return 0
Ejemplo n.º 8
0
def main(args):
    """
    Apply call-level and locus-level filters to a TR VCF and write the result.

    Produces ``<out>.vcf`` with recalculated INFO fields (AC, REFAC, HET,
    HWEP, HRUN) plus the logs ``<out>.samplog.tab`` and ``<out>.loclog.tab``.

    Parameters
    ----------
    args :
        Parsed command-line arguments. This function reads ``vcf``, ``out``,
        ``vcftype``, ``verbose``, ``num_records``, ``drop_filtered``,
        ``use_length`` and ``die_on_warning`` directly; the filter builders
        receive the whole object.

    Returns
    -------
    int
        0 on success, 1 when loading, validation or writer setup fails, or
        when an unparsable record is hit with ``die_on_warning`` set.
    """
    # Load VCF file
    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf, args.vcftype)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1

    # Set up locus-level filter list (built once; a ValueError means the
    # filter arguments were invalid)
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        # Use the first line of the filter's docstring as its description.
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC",
                              -1,
                              "Integer",
                              "Alternate allele counts",
                              source=None,
                              version=None)
    invcf.infos["REFAC"] = _Info("REFAC",
                                 1,
                                 "Integer",
                                 "Reference allele count",
                                 source=None,
                                 version=None)
    invcf.infos["HET"] = _Info("HET",
                               1,
                               "Float",
                               "Heterozygosity",
                               source=None,
                               version=None)
    invcf.infos["HWEP"] = _Info("HWEP",
                                1,
                                "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None,
                                version=None)
    invcf.infos["HRUN"] = _Info("HRUN",
                                1,
                                "Integer",
                                "Length of longest homopolymer run",
                                source=None,
                                version=None)

    # Set up output files (the output directory was validated above)
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning:
                return 1
            # Actually skip: without this the loop would carry on with an
            # unbound name or re-process the previous record.
            continue
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break
        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            # A filter returns None when the record passes it.
            if filt(record) is None:
                continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered:
            if record.call_rate == 0:
                output_record = False
        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                # Allele frequencies are fractions of 2 * num_called
                # chromosomes (same denominator REFAC uses). Round rather
                # than truncate so float error (e.g. 0.9999999999999998)
                # cannot drop a count by one.
                num_chroms = 2 * record.num_called
                record.INFO["AC"] = [
                    int(round(item * num_chroms)) for item in record.aaf
                ]
                record.INFO["REFAC"] = int(
                    round((1 - sum(record.aaf)) * num_chroms))
            else:
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")

    return 0
Ejemplo n.º 9
0
def load_strs(imputation_run_name: str,
              region: str,
              samples: np.ndarray,
              details: bool = True,
              var_subset: Optional[Set[int]] = None,
              hardcalls=False):
    """
    Iterate over a region returning genotypes at STR loci.

    If details is True, the first yield is a tuple of names of the fields in
    locus_details; otherwise no header tuple is yielded.
    Every subsequent yield is described in the yields section below.

    Parameters
    ----------
    imputation_run_name:
        which imputation run to load genotypes from?
    region:
        chr:start-end
    samples:
        A boolean array of length nsamples determining which samples are included
        (True) and which are not.
        slice(None) is also accepted and selects every sample.
    details:
        Whether to compute the per-locus details (and yield the header tuple
        first). Must be False when hardcalls is True.
    var_subset:
        If not None, only records whose POS is in this set are returned.
        Must not be None when hardcalls is True.
    hardcalls:
        If True, yield hardcalled length genotypes instead of dosages
        (see below).

    Yields
    ------
    dosages: Dict[float, np.ndarray]
        A dictionary from unique length alleles to 2D arrays of size (n_samples, 2)
        which contain the dosages of those alleles for each haplotype
        Length dosage are measured in number of repeats.

        None if locus_filtered is not None

        If hardcalls, then instead of that just an array nx2 of length alleles
    unique_alleles: np.ndarray
        Array of unique length alleles (measured in number of repeats),
        same length as the dosages dict
    chrom: str
        e.g. '13'
    pos: int
    locus_filtered:
        None if the locus is not filtered, otherwise
        a string explaining why.
        'MAC<20' if the total dosage of all alleles other than the most
        common one is less than 20 after sample subsetting, per plink's
        standard.

        None if hardcalls.
    locus_details:
        tuple of strings with the same length as the first yield
        with the corresponding order.

        None if details is False (which includes the hardcalls case).

    Notes
    -----
    Hardcalls mentioned in the locus details are phased hardcalls, and in some
    corner cases will not correspond to the maximum likelihood unphased allele.
    """

    if hardcalls:
        assert var_subset is not None and not details

    chrom, region_poses = region.split(':')
    region_start, _ = [int(pos) for pos in region_poses.split('-')]
    vcf_fname = (f'{ukb}/str_imputed/runs/{imputation_run_name}/'
                 f'vcfs/annotated_strs/chr{chrom}.vcf.gz')
    vcf = cyvcf2.VCF(vcf_fname)

    if details:
        yield ('motif', 'period', 'ref_len', 'total_per_allele_dosages',
               'total_hardcall_alleles', 'total_hardcall_genotypes',
               'subset_total_per_allele_dosages',
               'subset_total_hardcall_alleles',
               'subset_total_hardcall_genotypes', 'subset_het',
               'subset_entropy', 'subset_HWEP', 'subset_allele_dosage_r2')
    for record in vcf(region):
        if record.POS < region_start:
            # records that overlap this region but started before this region
            # should be considered part of the previous region and not returned here
            continue
        if record.INFO.get('PERIOD') is None:
            # there are a few duplicate loci which I didn't handle
            # properly, this identifies and removes them
            continue

        if var_subset is not None and record.POS not in var_subset:
            continue

        trrecord = trh.HarmonizeRecord(vcfrecord=record,
                                       vcftype='beagle-hipstr')

        # index 0 is the reference allele, indices 1.. are the alternates
        len_alleles = [trrecord.ref_allele_length
                       ] + trrecord.alt_allele_lengths
        len_alleles = [
            round(allele_len, allele_len_precision)
            for allele_len in len_alleles
        ]

        if hardcalls:
            # the last column of the length genotypes is dropped
            # (presumably the phasing flag - confirm against trh)
            yield (trrecord.GetLengthGenotypes()[samples, :-1],
                   np.unique(len_alleles), trrecord.chrom, trrecord.pos, None,
                   None)
            continue

        if details:
            # per-allele dosages summed over all samples, not just the subset
            total_dosages = {_len: 0 for _len in np.unique(len_alleles)}
            for p in (1, 2):
                ap = trrecord.format[f'AP{p}']
                # AP only has columns for the alternate alleles, the
                # reference dosage is the non-negative remainder
                total_dosages[len_alleles[0]] += np.sum(
                    np.maximum(0, 1 - np.sum(ap, axis=1)))
                for i in range(ap.shape[1]):
                    total_dosages[len_alleles[i + 1]] += np.sum(ap[:, i])

            total_hardcall_alleles = clean_len_alleles(
                trrecord.GetAlleleCounts())
            total_hardcall_genotypes = clean_len_allele_pairs(
                trrecord.GetGenotypeCounts())

        if isinstance(samples, slice):
            assert samples == slice(None)
            n_subset_samples = trrecord.GetNumSamples()
        else:
            n_subset_samples = int(np.sum(samples))

        # one (n_subset_samples, 2) array per unique length allele,
        # column p-1 holds the dosage on haplotype p
        subset_dosage_gts = {
            _len: np.zeros((n_subset_samples, 2))
            for _len in np.unique(len_alleles)
        }

        for p in (1, 2):
            # todo genotype dosages
            ap = trrecord.format[f'AP{p}']
            subset_dosage_gts[len_alleles[0]][:, (p-1)] += \
                    np.maximum(0, 1 - np.sum(ap[samples, :], axis=1))
            for i in range(ap.shape[1]):
                subset_dosage_gts[len_alleles[i + 1]][:,
                                                      (p - 1)] += ap[samples,
                                                                     i]

        subset_total_dosages = {
            _len: np.sum(subset_dosage_gts[_len])
            for _len in subset_dosage_gts
        }

        if details:
            subset_total_hardcall_alleles = clean_len_alleles(
                trrecord.GetAlleleCounts(samples))
            subset_total_hardcall_genotypes = clean_len_allele_pairs(
                trrecord.GetGenotypeCounts(samples))
            subset_hardcall_allele_freqs = clean_len_alleles(
                trrecord.GetAlleleFreqs(samples))

            subset_het = utils.GetHeterozygosity(subset_hardcall_allele_freqs)
            subset_entropy = utils.GetEntropy(subset_hardcall_allele_freqs)
            subset_hwep = utils.GetHardyWeinbergBinomialTest(
                subset_hardcall_allele_freqs, subset_total_hardcall_genotypes)

            # https://www.cell.com/ajhg/fulltext/S0002-9297(09)00012-3#app1
            # Browning, Brian L., and Sharon R. Browning. "A unified approach to genotype imputation and haplotype-phase inference for large data sets of trios and unrelated individuals." The American Journal of Human Genetics 84.2 (2009): 210-223.
            # appendix 1
            subset_allele_dosage_r2 = {}

            subset_hardcalls = np.around(
                trrecord.GetLengthGenotypes()[samples, :-1],
                allele_len_precision)
            for length in len_alleles:
                # calculate allele dosage r**2 for this length
                if length in subset_allele_dosage_r2:
                    continue

                calls = subset_hardcalls == length

                # squared correlation between the flattened hardcall
                # indicators and the flattened dosages for this length
                subset_allele_dosage_r2[length] = np.corrcoef(
                    calls.reshape(-1),
                    subset_dosage_gts[length].reshape(-1))[0, 1]**2

            locus_details = (trrecord.motif, str(len(trrecord.motif)),
                             str(
                                 round(trrecord.ref_allele_length,
                                       allele_len_precision)),
                             dict_str(
                                 round_vals(total_dosages, dosage_precision)),
                             dict_str(total_hardcall_alleles),
                             dict_str(total_hardcall_genotypes),
                             dict_str(
                                 round_vals(subset_total_dosages,
                                            dosage_precision)),
                             dict_str(subset_total_hardcall_alleles),
                             dict_str(subset_total_hardcall_genotypes),
                             str(subset_het), str(subset_entropy),
                             str(subset_hwep),
                             dict_str(
                                 round_vals(subset_allele_dosage_r2,
                                            r2_precision)))
        else:
            locus_details = None

        # minor allele count: total dosage of everything except the single
        # most common allele
        mac = list(subset_total_dosages.values())
        mac.pop(np.argmax(mac))

        if np.sum(mac) < 20:
            yield (None, np.unique(len_alleles), trrecord.chrom, trrecord.pos,
                   'MAC<20', locus_details)
            continue

        yield (subset_dosage_gts, np.unique(len_alleles), trrecord.chrom,
               trrecord.pos, None, locus_details)
    merged_arr = utils.merge_arrays(samples_array, pheno_data)
    unfiltered_subset = ~np.isnan(merged_arr[:, 1])
    n_samples = np.sum(unfiltered_subset)

    vcf = cyvcf2.VCF(args.imputed_vcf)
    found_rec = False
    for record in vcf(f'{args.chrom}:{args.pos}-{args.pos}'):
        if record.POS < args.pos:
            continue
        if record.INFO.get('PERIOD') is None:
            continue

        assert not found_rec
        found_rec = True

        trrecord = trh.HarmonizeRecord(vcfrecord=record, vcftype='beagle-hipstr')

        len_alleles = [trrecord.ref_allele_length] + trrecord.alt_allele_lengths
        len_alleles = [round(allele_len, 2) for allele_len in len_alleles]

        ap1 = trrecord.format['AP1']
        ap1 = np.concatenate((1 - np.sum(ap1, axis=1).reshape(-1, 1), ap1), axis=1)
        ap2 = trrecord.format['AP2']
        ap2 = np.concatenate((1 - np.sum(ap2, axis=1).reshape(-1, 1), ap2), axis=1)

    # TODO this needs better testing
    subset_summed_dosages = {}
    for aidx1, len_allele1 in enumerate(len_alleles):
        for aidx2, len_allele2 in enumerate(len_alleles):
            summed_len = len_allele1 + len_allele2
            if summed_len not in subset_summed_dosages:
Ejemplo n.º 11
0
def main(args):
    """
    Compute per-locus TR statistics from a VCF and write them as a table.

    One tab-separated row is written per record to ``<out>.tab`` (or to
    standard output when ``args.out`` is ``"stdout"``). The columns after
    chrom/start/end depend on which statistic flags are set.

    Parameters
    ----------
    args :
        Parsed command-line arguments. Reads ``vcf``, ``out``, ``samples``,
        ``sample_prefixes``, ``vcftype``, ``region``, ``use_length``,
        ``plot_afreq`` and the statistic flags ``thresh``, ``afreq``,
        ``acount``, ``hwep``, ``het``, ``mean``, ``mode``, ``var`` and
        ``numcalled``.

    Returns
    -------
    int
        0 on success, 1 when an input or argument check fails.
    """
    if not os.path.exists(args.vcf):
        common.WARNING("Error: %s does not exist" % args.vcf)
        return 1

    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Load samples
    sample_lists = []
    sample_prefixes = []
    if args.samples:
        sfiles = args.samples.split(",")
        if args.sample_prefixes:
            sample_prefixes = args.sample_prefixes.split(",")
        else:
            # Default prefixes are "1", "2", ... in file order.
            sample_prefixes = [str(item) for item in range(1, len(sfiles) + 1)]
        if len(sfiles) != len(sample_prefixes):
            common.MSG("--sample-prefixes must be same length as --samples")
            return 1
        for sf in sfiles:
            # Close each sample-list file as soon as it has been read.
            with open(sf, "r") as sample_file:
                sample_lists.append([line.strip() for line in sample_file])

    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1
    if args.vcftype != 'auto':
        vcftype = trh.VcfTypes[args.vcftype]
    else:
        vcftype = trh.InferVCFType(invcf)

    header = ["chrom", "start", "end"]
    if args.thresh: header.extend(GetHeader("thresh", sample_prefixes))
    if args.afreq: header.extend(GetHeader("afreq", sample_prefixes))
    if args.acount: header.extend(GetHeader("acount", sample_prefixes))
    if args.hwep: header.extend(GetHeader("hwep", sample_prefixes))
    if args.het: header.extend(GetHeader("het", sample_prefixes))
    if args.mean: header.extend(GetHeader("mean", sample_prefixes))
    if args.mode: header.extend(GetHeader("mode", sample_prefixes))
    if args.var: header.extend(GetHeader("var", sample_prefixes))
    if args.numcalled: header.extend(GetHeader("numcalled", sample_prefixes))

    # Finish every check that can fail before opening the output, so an
    # early return cannot leave an open (and half-written) file behind.
    write_to_stdout = args.out == "stdout"
    if write_to_stdout and args.plot_afreq:
        common.MSG("Cannot use --out stdout when generating plots")
        return 1

    if args.region:
        if not os.path.isfile(args.vcf + ".tbi"):
            common.MSG("Make sure %s is bgzipped and indexed" % args.vcf)
            return 1
        regions = invcf.fetch(args.region)
    else:
        regions = invcf

    outf = sys.stdout if write_to_stdout else open(args.out + ".tab", "w")
    try:
        outf.write("\t".join(header) + "\n")

        num_plotted = 0
        for record in regions:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # NOTE(review): `<=` allows MAXPLOTS + 1 plots; confirm whether
            # `<` was intended.
            if args.plot_afreq and num_plotted <= MAXPLOTS:
                PlotAlleleFreqs(trrecord,
                                args.out,
                                samplelists=sample_lists,
                                sampleprefixes=sample_prefixes)
                num_plotted += 1
            items = [
                record.CHROM, record.POS, record.POS + len(trrecord.ref_allele)
            ]
            if args.thresh:
                items.extend(GetThresh(trrecord, samplelists=sample_lists))
            if args.afreq:
                items.extend(
                    GetAFreq(trrecord,
                             samplelists=sample_lists,
                             uselength=args.use_length))
            if args.acount:
                items.extend(
                    GetAFreq(trrecord,
                             samplelists=sample_lists,
                             uselength=args.use_length,
                             count=True))
            if args.hwep:
                items.extend(
                    GetHWEP(trrecord,
                            samplelists=sample_lists,
                            uselength=args.use_length))
            if args.het:
                items.extend(
                    GetHet(trrecord,
                           samplelists=sample_lists,
                           uselength=args.use_length))
            if args.mean:
                items.extend(GetMean(trrecord, samplelists=sample_lists))
            if args.mode:
                items.extend(GetMode(trrecord, samplelists=sample_lists))
            if args.var:
                items.extend(GetVariance(trrecord, samplelists=sample_lists))
            if args.numcalled:
                items.extend(GetNumSamples(trrecord, samplelists=sample_lists))
            outf.write("\t".join([str(item) for item in items]) + "\n")
    finally:
        if write_to_stdout:
            # Never close the interpreter's stdout; just push the rows out.
            outf.flush()
        else:
            outf.close()
    return 0
Ejemplo n.º 12
0
def _last_term_pvalue(outcomes, covs, *gts):
    """Fit an OLS model and return the p-value of its last regressor.

    The design matrix is ``covs``, followed by a column of ones (intercept),
    followed by each array in ``gts`` reshaped to a single column, in the
    order given.
    """
    columns = [covs, np.ones((covs.shape[0], 1))]
    columns.extend(gt.reshape(-1, 1) for gt in gts)
    # presumably statsmodels' OLS, where pvalues[-1] belongs to the last
    # column of the design matrix -- TODO confirm against the file's imports
    return OLS(outcomes, np.hstack(columns)).fit().pvalues[-1]


def main():
    """Association-test listed variants against phenotypes, per ethnicity.

    Takes one positional command-line argument, ``var_file``: a tab-separated
    table with the columns ``chrom``, ``pos``, ``alleles``, ``name`` and
    ``phenos``. ``alleles`` is a comma-separated list of integer allele
    indices (e.g. ``0,2,5`` for the ref, the 2nd alt and allele 5) and
    ``phenos`` is a comma-separated list of phenotype names.

    For every row, phenotype and ethnicity, one tab-separated line is printed
    to stdout (after a single header line) containing: the locus, the variant
    name, the ethnicity, the combined frequency of the listed alleles, the
    squared correlation between the variant and STR dosages, the phenotype,
    and the p-values of the STR alone, the variant alone, and the STR
    conditioned on the variant.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('var_file')
    args = parser.parse_args()

    print(
        'str\tvariant\tethnicity\tvar frequency\tstr var r2\tphenotype\tstr p-val\tvar p-val\tstr p-val conditioning on var'
    )

    scovs = np.load(f'{ukb}/traits/shared_covars/shared_covars.npy')
    ethnicities = ('white_brits', 'black', 'south_asian', 'chinese', 'irish',
                   'white_other')

    variants = pl.read_csv(args.var_file, sep='\t')
    for i in range(variants.shape[0]):
        chrom = variants[i, 'chrom']
        pos = variants[i, 'pos']
        str_ = f'{chrom}:{pos}'
        # first record returned by the region query at chrom:pos
        var = next(
            cyvcf2.VCF(
                f'{ukb}/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chrom}.vcf.gz'
            )(str_))
        alleles = [int(num) for num in variants[i, 'alleles'].split(',')]
        name = variants[i, 'name']
        phenotypes = variants[i, 'phenos'].split(',')

        # (var_freq, str_var_r2) depend on the variant and the ethnicity but
        # not on the phenotype, so compute them once per ethnicity and reuse
        # them for every phenotype of this variant.
        ethnicity_stats = {}

        for phenotype in phenotypes:
            for ethnicity in ethnicities:
                # ethnicity-level statistics
                if ethnicity not in ethnicity_stats:
                    total_samp_idx = sample_utils.get_samples_idx_ethnicity(
                        ethnicity)
                    total_var_gts = var_dosage_gts(var, total_samp_idx,
                                                   alleles)
                    total_str_gts = str_dosage_gts(var, total_samp_idx)
                    # dosages are summed over two chromosomes per sample
                    freq = np.sum(total_var_gts) / (2 *
                                                    total_var_gts.shape[0])
                    corr = np.corrcoef(total_var_gts, total_str_gts)
                    assert corr.shape == (2, 2)
                    ethnicity_stats[ethnicity] = (freq, corr[0, 1]**2)
                var_freq, str_var_r2 = ethnicity_stats[ethnicity]

                # ethnicity- and phenotype-level data
                pcovs = np.load(
                    f'{ukb}/traits/subset_transformed_phenotypes/{ethnicity}/{phenotype}.npy'
                )
                samps = sample_utils.get_ordered_samples_phenotype(
                    ethnicity, phenotype).reshape(-1, 1)
                covs = python_array_utils.merge_arrays(
                    python_array_utils.merge_arrays(samps, pcovs), scovs)

                # column 1 is used as the outcome and columns 2+ as the
                # covariates (column 0 is presumably the sample id used
                # for merging -- verify against merge_arrays)
                outcomes = covs[:, 1]
                covs = covs[:, 2:]

                samp_idx = sample_utils.get_samples_idx_phenotype(
                    ethnicity, phenotype)
                var_gts = standardize(var_dosage_gts(var, samp_idx, alleles))
                str_gts = standardize(str_dosage_gts(var, samp_idx))

                str_p = _last_term_pvalue(outcomes, covs, str_gts)

                # NOTE(review): this monomorphic check runs on the
                # *standardized* dosages; confirm standardize() leaves
                # constant vectors at their raw 0/2 values, otherwise this
                # branch can never trigger.
                if np.all(var_gts == 0) or np.all(var_gts == 2):
                    var_p = 1
                    str_cond_p = str_p
                else:
                    var_p = _last_term_pvalue(outcomes, covs, var_gts)
                    str_cond_p = _last_term_pvalue(outcomes, covs, var_gts,
                                                   str_gts)

                print(
                    f'{str_}\t{name}\t{ethnicity}\t{var_freq:.3g}\t{str_var_r2:.3g}\t{phenotype}\t{str_p:.3g}\t{var_p:.3g}\t{str_cond_p:.3g}'
                )