Esempio n. 1
0
def add_snp(snp, position, sequence):
    """Store the call for a single site in 'sequence' at index 'position'.

    If the record has no alternative allele (snp.alt is "."), the reference
    value (snp.ref) is stored as-is. Otherwise the genotype returned by
    vcfwrap.get_ml_genotype is joined into a single string and stored after
    being passed through sequences.encode_genotype.

    The sequence is modified in place; nothing is returned.
    """
    if snp.alt == ".":
        # No alternative allele recorded; keep the reference value
        sequence[position] = snp.ref
        return

    # NOTE(review): get_ml_genotype presumably returns the most likely pair
    # of alleles, and encode_genotype a single-character code -- confirm.
    alleles = vcfwrap.get_ml_genotype(snp)
    sequence[position] = sequences.encode_genotype("".join(alleles))
Esempio n. 2
0
def add_snp(snp, position, sequence):
    """Write the value for one VCF record into sequence[position].

    Records without an alternative allele (snp.alt == ".") contribute their
    reference value unchanged; all other records contribute the result of
    sequences.encode_genotype applied to the joined output of
    vcfwrap.get_ml_genotype. The sequence is updated in place.
    """
    # The condition is evaluated first, so the genotype helpers are only
    # invoked for records that actually carry an alternative allele.
    sequence[position] = (
        snp.ref
        if snp.alt == "."
        else sequences.encode_genotype("".join(vcfwrap.get_ml_genotype(snp)))
    )
def main(argv):
    """Print a summary of the genotype classes found at selected VCF sites.

    'argv' must hold exactly two items: the path of a BED file followed by
    the path of a VCF file that can be opened with pysam.Tabixfile. If the
    number of arguments is wrong, a usage message is written to stderr and
    1 is returned. Otherwise the records produced by select_vcf_records are
    tallied, a summary is printed to stdout, and None is returned.
    """
    if len(argv) != 2:
        # Fill in the program name; previously a literal '%s' was printed
        sys.stderr.write("Usage: %s <BED-file> <VCF.bgz>\n" % (sys.argv[0],))
        return 1

    sites = 0
    sites_non_ref = 0
    sites_homo_non_ref = 0
    sites_het_one_non_ref = 0
    sites_het_two_non_ref = 0

    vcf_records = pysam.Tabixfile(argv[1])
    try:
        bed_records = read_bed_records(argv[0])

        for record in select_vcf_records(bed_records, vcf_records):
            if record.alt == '.':
                # No alternative allele: homozygous for the reference allele
                sites += 1
                continue

            # Get the most likely diploid genotype
            nt_a, nt_b = get_ml_genotype(record)
            if (nt_a, nt_b) == ('N', 'N'):
                # Skip sites with no most likely genotype
                continue

            sites += 1
            sites_non_ref += 1
            if nt_a == nt_b:
                sites_homo_non_ref += 1
            elif record.ref not in (nt_a, nt_b):
                sites_het_two_non_ref += 1
            else:
                sites_het_one_non_ref += 1
    finally:
        # Release the Tabix handle even if reading fails part-way through
        vcf_records.close()

    def percentage(count):
        # Avoid a ZeroDivisionError when no sites were kept at all
        return (100.0 * count) / sites if sites else 0.0

    # Single-argument print(...) behaves identically on Python 2 and 3
    print("")
    print("%i sites kept after filtering:" % (sites, ))
    print(" % 10i homozygous sites containing the reference allele (%.2f%%)" % (
        sites - sites_non_ref, percentage(sites - sites_non_ref)))
    print(" % 10i heterozygous sites containing the reference and a non-reference allele (%.2f%%)" % (
        sites_het_one_non_ref, percentage(sites_het_one_non_ref)))
    print(" % 10i homozygous sites containing a single non-reference allele (%.2f%%)" % (
        sites_homo_non_ref, percentage(sites_homo_non_ref)))
    print(" % 10i heterozygous sites containing two different non-reference alleles (%.2f%%)" % (
        sites_het_two_non_ref, percentage(sites_het_two_non_ref)))
Esempio n. 4
0
def add_indel(options, bed, indel, sequence):
    """Apply a homozygous indel call to 'sequence', in place.

    Nothing is done if the record has no alternative allele, if the two
    alleles returned by vcfwrap.get_ml_genotype differ, or if the genotype
    is "N". Deleted positions are set to the empty string, while inserted
    bases are appended to the entry at the position of the insertion.

    Indices into 'sequence' are relative to the start of the BED record
    minus options.padding (but never less than zero). If
    options.whole_codon_indels_only is set, indels are skipped unless the
    affected length is a multiple of three.
    """
    if indel.alt == ".":
        return

    genotype = vcfwrap.get_ml_genotype(indel)
    allele = genotype[0]
    # Heterozygous indels cannot be represented, and "N" means that no most
    # likely genotype could be determined.
    if allele != genotype[1] or allele == "N":
        return

    # Note that bed.end is a past-the-end coordinate
    start = max(0, bed.start - options.padding)

    # FIXME: parse_indel only supports a single 'alt' values
    indel.alt = allele
    parsed = vcfwrap.parse_indel(indel)

    if parsed.in_reference:
        # Deletion; clip the affected range to the BED record itself
        first_deleted = parsed.pos + 1
        del_start = max(first_deleted, bed.start)
        del_end = min(first_deleted + len(parsed.what), bed.end)
        overlap = del_end - del_start

        if overlap <= 0:
            # Deletion does not cover any bases of interest
            return
        if options.whole_codon_indels_only and overlap % 3:
            # Non-codon sized overlap with area of interest
            return

        for offset in range(del_start - start, del_end - start):
            sequence[offset] = ""
        return

    if len(parsed.what) % 3 and options.whole_codon_indels_only:
        # Insertion that is not a whole number of codons
        return

    # parse_indel assumes that the insertion is always the first possible
    # base when multiple positions are possible. As a consequence, the
    # position may be before start, with the rest of the bases overlapping
    # the current sequence. For example:
    #  ref = ATTT
    #  alt = ATTTT
    # It is assumed that the insertion (_) happened thus:
    #  interpretation = A_TTT
    offset = parsed.pos - start
    if offset >= 0:
        sequence[offset] += parsed.what
Esempio n. 5
0
def add_indel(options, bed, indel, sequence):
    """Update 'sequence' in place to reflect a homozygous indel.

    Records are ignored when they carry no alternative allele, when the
    genotype from vcfwrap.get_ml_genotype is heterozygous, or when that
    genotype is "N". For deletions, the covered entries of 'sequence' are
    replaced with empty strings; for insertions, the inserted bases are
    appended to the entry at the insertion position.

    Positions are translated to indices by subtracting the (padded) start
    of the BED record. With options.whole_codon_indels_only enabled, only
    indels affecting a multiple of three bases are applied.
    """
    if indel.alt == ".":
        return

    calls = vcfwrap.get_ml_genotype(indel)
    is_homozygous = calls[0] == calls[1]
    if not is_homozygous:
        # No way to represent heterozygous indels
        return
    if calls[0] == "N":
        # No most likely genotype
        return

    # Index 0 of 'sequence' corresponds to this coordinate; note that
    # bed.end is a past-the-end coordinate
    origin = max(0, bed.start - options.padding)

    # FIXME: parse_indel only supports a single 'alt' values
    indel.alt = calls[0]
    event = vcfwrap.parse_indel(indel)

    if event.in_reference:
        lower = max(event.pos + 1, bed.start)
        upper = min(event.pos + 1 + len(event.what), bed.end)

        # Apply the deletion only if it covers bases of interest and, when
        # whole codons are required, if the overlap is codon sized
        if lower < upper and not (options.whole_codon_indels_only
                                  and (upper - lower) % 3):
            for index in range(lower - origin, upper - origin):
                sequence[index] = ""
    elif not (len(event.what) % 3 and options.whole_codon_indels_only):
        # parse_indel assumes that the insertion is always the first possible
        # base when multiple positions are possible. As a consequence, the
        # position may be before the origin, with the rest of the bases
        # overlapping the current sequence. For example:
        #  ref = ATTT
        #  alt = ATTTT
        # It is assumed that the insertion (_) happened thus:
        #  interpretation = A_TTT
        if event.pos >= origin:
            sequence[event.pos - origin] += event.what
def main(argv):
    """Tally and print the genotype classes observed at selected VCF sites.

    'argv' must contain two items: the path of a BED file and the path of a
    VCF file readable via pysam.Tabixfile. With any other number of items, a
    usage message is written to stderr and 1 is returned. Otherwise every
    record produced by select_vcf_records is classified, the totals are
    printed to stdout, and None is returned.
    """
    if len(argv) != 2:
        # The '%s' placeholder was previously printed verbatim; fill it in
        sys.stderr.write("Usage: %s <BED-file> <VCF.bgz>\n" % (sys.argv[0],))
        return 1

    sites = 0
    sites_non_ref = 0
    sites_homo_non_ref = 0
    sites_het_one_non_ref = 0
    sites_het_two_non_ref = 0

    vcf_records = pysam.Tabixfile(argv[1])
    try:
        bed_records = read_bed_records(argv[0])

        for record in select_vcf_records(bed_records, vcf_records):
            if record.alt != '.':
                # Get the most likely diploid genotype
                nt_a, nt_b = get_ml_genotype(record)
                if (nt_a, nt_b) == ('N', 'N'):
                    # Skip sites with no most likely genotype
                    continue

                sites += 1
                sites_non_ref += 1
                if nt_a == nt_b:
                    sites_homo_non_ref += 1
                elif record.ref not in (nt_a, nt_b):
                    sites_het_two_non_ref += 1
                else:
                    sites_het_one_non_ref += 1
            else:
                # Homozygous for the reference allele (no alternative call)
                sites += 1
    finally:
        # Always release the Tabix handle, including on errors
        vcf_records.close()

    # Guard against dividing by zero when every site was skipped
    scale = 100.0 / sites if sites else 0.0
    sites_ref = sites - sites_non_ref

    # Single-argument print(...) gives the same output on Python 2 and 3
    print("")
    print("%i sites kept after filtering:" % (sites,))
    print(" % 10i homozygous sites containing the reference allele (%.2f%%)" % (sites_ref, scale * sites_ref))
    print(" % 10i heterozygous sites containing the reference and a non-reference allele (%.2f%%)" % (sites_het_one_non_ref, scale * sites_het_one_non_ref))
    print(" % 10i homozygous sites containing a single non-reference allele (%.2f%%)" % (sites_homo_non_ref, scale * sites_homo_non_ref))
    print(" % 10i heterozygous sites containing two different non-reference alleles (%.2f%%)" % (sites_het_two_non_ref, scale * sites_het_two_non_ref))
Esempio n. 7
0
def _filter_by_properties(options, vcfs, frequencies):
    """Filters a list of SNPs/indels based on the various properties recorded in
    the info column, and others. This mirrors most of the filtering carried out
    by vcfutils.pl varFilter.

    Each record in 'vcfs' is checked against the thresholds in 'options';
    for every failed check, _mark_as_filtered is called with the record and
    a short code naming the filter (and, where relevant, the threshold).
    'frequencies' is consulted for heterozygous genotypes that do not
    include the reference allele. Nothing is returned.

    Raises KeyError if a record lacks the 'DP' field, or the 'DP4' field in
    the case of records with an alternative allele.
    """
    for vcf in vcfs:
        if float(vcf.qual) < options.min_quality:
            _mark_as_filtered(vcf, "q:%i" % options.min_quality)

        # Parse the INFO column into a dict; flags without a value map to None
        properties = {}
        for field in vcf.info.split(";"):
            if "=" in field:
                # Split on the first '=' only, as values may contain '='
                key, value = field.split("=", 1)
            else:
                key, value = field, None
            properties[key] = value

        read_depth = float(properties["DP"])
        if options.min_read_depth > read_depth:
            _mark_as_filtered(vcf, "d:%i" % options.min_read_depth)
        elif options.max_read_depth < read_depth:
            _mark_as_filtered(vcf, "D:%i" % options.max_read_depth)

        if "MQ" in properties:
            if float(properties["MQ"]) < options.min_mapping_quality:
                _mark_as_filtered(vcf, "Q:%i" % options.min_mapping_quality)

        if "PV4" in properties:
            # Four values, compared in order with the strand, base-quality,
            # mapping-quality, and end-distance bias thresholds
            pv4 = [float(value) for value in properties["PV4"].split(",")]
            if pv4[0] < options.min_strand_bias:
                _mark_as_filtered(vcf, "1:%e" % options.min_strand_bias)
            if pv4[1] < options.min_baseq_bias:
                _mark_as_filtered(vcf, "2:%e" % options.min_baseq_bias)
            if pv4[2] < options.min_mapq_bias:
                _mark_as_filtered(vcf, "3:%e" % options.min_mapq_bias)
            if pv4[3] < options.min_end_distance_bias:
                _mark_as_filtered(vcf, "4:%e" % options.min_end_distance_bias)

        if vcf.alt != ".":
            ref_fw, ref_rev, alt_fw, alt_rev = map(int, properties["DP4"].split(","))
            if (alt_fw + alt_rev) < options.min_num_alt_bases:
                _mark_as_filtered(vcf, "a:%i" % options.min_num_alt_bases)

            ml_genotype = vcfwrap.get_ml_genotype(vcf)
            if (ml_genotype == ("N", "N")) and not options.keep_ambigious_genotypes:
                # No most likely genotype
                _mark_as_filtered(vcf, "k")

            if ml_genotype[0] != ml_genotype[1]:
                if vcf.contig in options.homozygous_chromosome:
                    _mark_as_filtered(vcf, "HET")

                # Filter by frequency of minor allele
                if vcf.ref in ml_genotype:
                    n_minor = min(ref_fw + ref_rev, alt_fw + alt_rev)
                    n_total = ref_fw + ref_rev + alt_fw + alt_rev

                    # With DP4 = 0,0,0,0 there is no support for either
                    # allele; treat the frequency as zero rather than
                    # failing with a ZeroDivisionError
                    frequency = n_minor / float(n_total) if n_total else 0.0
                    if frequency < options.min_allele_frequency:
                        _mark_as_filtered(vcf, "f:%.4f" % options.min_allele_frequency)
                else:
                    state = frequencies.frequency_is_valid(vcf.contig, vcf.pos, vcf.ref, *ml_genotype)
                    if state is frequencies.INVALID:
                        _mark_as_filtered(vcf, "f:%.4f" % options.min_allele_frequency)
                    elif state is frequencies.NA:
                        # Only warn if _mark_as_filtered returns a true value
                        if _mark_as_filtered(vcf, "F:%.4f" % options.min_allele_frequency):
                            sys.stderr.write("WARNING: Could not determine allele-counts for SNP at %s:%s, filtering ...\n" % (vcf.contig, vcf.pos + 1))
Esempio n. 8
0
def _filter_by_properties(options, vcfs, frequencies):
    """Filters a list of SNPs/indels based on the various properties recorded in
    the info column, and others. This mirrors most of the filtering carried out
    by vcfutils.pl varFilter.

    Every record in 'vcfs' is compared against the thresholds in 'options'.
    Each failed check results in a call to _mark_as_filtered with the record
    and a short code identifying the filter. 'frequencies' is only used for
    heterozygous genotypes in which neither allele is the reference allele.
    Nothing is returned.

    Raises KeyError if a record lacks the 'DP' field, or the 'DP4' field in
    the case of records with an alternative allele.
    """
    for vcf in vcfs:
        if float(vcf.qual) < options.min_quality:
            _mark_as_filtered(vcf, "q:%i" % options.min_quality)

        # Collect the INFO column as key/value pairs; bare flags map to None
        properties = {}
        for field in vcf.info.split(";"):
            if "=" in field:
                # Values may themselves contain '='; split on the first only
                key, value = field.split("=", 1)
            else:
                key, value = field, None
            properties[key] = value

        read_depth = float(properties["DP"])
        if options.min_read_depth > read_depth:
            _mark_as_filtered(vcf, "d:%i" % options.min_read_depth)
        elif options.max_read_depth < read_depth:
            _mark_as_filtered(vcf, "D:%i" % options.max_read_depth)

        if "MQ" in properties:
            if float(properties["MQ"]) < options.min_mapping_quality:
                _mark_as_filtered(vcf, "Q:%i" % options.min_mapping_quality)

        if "PV4" in properties:
            # Four values, checked in order against the strand, base-quality,
            # mapping-quality, and end-distance bias thresholds
            pv4 = [float(value) for value in properties["PV4"].split(",")]
            if pv4[0] < options.min_strand_bias:
                _mark_as_filtered(vcf, "1:%e" % options.min_strand_bias)
            if pv4[1] < options.min_baseq_bias:
                _mark_as_filtered(vcf, "2:%e" % options.min_baseq_bias)
            if pv4[2] < options.min_mapq_bias:
                _mark_as_filtered(vcf, "3:%e" % options.min_mapq_bias)
            if pv4[3] < options.min_end_distance_bias:
                _mark_as_filtered(vcf, "4:%e" % options.min_end_distance_bias)

        if vcf.alt != ".":
            ref_fw, ref_rev, alt_fw, alt_rev = map(
                int, properties["DP4"].split(","))
            if (alt_fw + alt_rev) < options.min_num_alt_bases:
                _mark_as_filtered(vcf, "a:%i" % options.min_num_alt_bases)

            ml_genotype = vcfwrap.get_ml_genotype(vcf)
            if (ml_genotype
                    == ("N", "N")) and not options.keep_ambigious_genotypes:
                # No most likely genotype
                _mark_as_filtered(vcf, "k")

            if ml_genotype[0] != ml_genotype[1]:
                if vcf.contig in options.homozygous_chromosome:
                    _mark_as_filtered(vcf, "HET")

                # Filter by frequency of minor allele
                if vcf.ref in ml_genotype:
                    n_ref = ref_fw + ref_rev
                    n_alt = alt_fw + alt_rev
                    n_minor = min(n_ref, n_alt)
                    n_total = n_ref + n_alt

                    # A DP4 of 0,0,0,0 would otherwise cause a
                    # ZeroDivisionError; no supporting bases is treated as
                    # a minor allele frequency of zero
                    if n_total:
                        frequency = n_minor / float(n_total)
                    else:
                        frequency = 0.0

                    if frequency < options.min_allele_frequency:
                        _mark_as_filtered(
                            vcf, "f:%.4f" % options.min_allele_frequency)
                else:
                    state = frequencies.frequency_is_valid(
                        vcf.contig, vcf.pos, vcf.ref, *ml_genotype)
                    if state is frequencies.INVALID:
                        _mark_as_filtered(
                            vcf, "f:%.4f" % options.min_allele_frequency)
                    elif state is frequencies.NA:
                        # Only warn if _mark_as_filtered returns a true value
                        if _mark_as_filtered(
                                vcf, "F:%.4f" % options.min_allele_frequency):
                            sys.stderr.write(
                                "WARNING: Could not determine allele-counts for SNP at %s:%s, filtering ...\n"
                                % (vcf.contig, vcf.pos + 1))