Esempio n. 1
0
def setRefPos(variant, seq_handler, padding=200):
    """
    Add the attributes upstream_start, upstream_end, downstream_start and downstream_end in VCFRecord. For insertions the start is defined on the first position before the insertion and the end on the last position affected by the insertion.

    For a normalized indel the upstream coordinates come from the variant as provided and the downstream coordinates come from the record returned by getMostDownstream(). For any other variant the downstream coordinates are equal to the upstream ones.

    :param variant: The variant to update.
    :type variant: anacore.vcf.VCFRecord
    :param seq_handler: The reader of the reference sequences. It must provide getSub(chrom, start, end).
    :type seq_handler: object with a getSub method (presumably an indexed fasta reader, to confirm against the caller)
    :param padding: The number of reference nucleotides retrieved after the variant to search its most downstream location.
    :type padding: int
    """
    empty_allele = VCFRecord.getEmptyAlleleMarker()
    if variant.ref == empty_allele or variant.alt[0] == empty_allele:  # Normalized indel
        # Most upstream
        variant.upstream_start, variant.upstream_end = getStartEnd(variant)
        # Most downstream: the search is done on a sub-region starting two
        # nucleotides before the variant, so the variant is on position 3 of
        # this sub-region
        sub_region = seq_handler.getSub(
            variant.chrom,
            variant.pos - 2,
            variant.pos + len(variant.ref) + padding
        )
        chrom_pos = variant.pos
        variant.pos = 3  # Switch position from chromosome to position from subregion
        try:
            downstream_var = variant.getMostDownstream(sub_region)
            # Switch position from subregion to position from chromosome
            downstream_pos = chrom_pos + downstream_var.pos - 3
        finally:
            # Never leave the temporary sub-region position on the variant
            variant.pos = chrom_pos
        # Assigned after the restoration so the result is also right if
        # getMostDownstream() returns the variant itself instead of a new record
        downstream_var.pos = downstream_pos
        variant.downstream_start, variant.downstream_end = getStartEnd(
            downstream_var)
    else:
        variant.upstream_start, variant.upstream_end = getStartEnd(variant)
        variant.downstream_start = variant.upstream_start
        variant.downstream_end = variant.upstream_end
Esempio n. 2
0
def getSupportingReads(var, chrom_seq, FH_aln, log):
    """
    Return the names of the reads supporting the alternative allele of the variant.

    A read is evaluated only if it is not flagged as duplicate, has at least one aligned reference position and covers the whole interval from var.upstream_start to var.downstream_end. The alleles seen by the read are compared to the variant on the upstream coordinates first, then on the downstream coordinates when they differ from the upstream ones.

    :param var: The variant. It must have the attributes upstream_start, upstream_end, downstream_start and downstream_end.
    :type var: anacore.vcf.VCFRecord
    :param chrom_seq: The sequence of the chromosome.
    :type chrom_seq: str
    :param FH_aln: The file handle to the alignments file. The variants must have been defined from this alignments file.
    :type FH_aln: pysam.AlignmentFile
    :param log: The logger object.
    :type log: logging.Logger
    :return: The names of the supporting reads.
    :rtype: set
    """
    supporting_reads = set()
    is_insertion = var.isInsertion()
    # Coordinates to evaluate, by priority order
    candidates = [(var.upstream_start, var.upstream_end)]
    if var.upstream_start != var.downstream_start:
        candidates.append((var.downstream_start, var.downstream_end))
    for read in FH_aln.fetch(var.chrom, var.upstream_start - 1, var.downstream_end):
        if read.is_duplicate:
            continue
        aligned_pos = read.get_reference_positions()
        if len(aligned_pos) == 0:  # Skip alignment with problem
            continue
        ref_start = aligned_pos[0] + 1  # 0-based to 1-based
        ref_end = aligned_pos[-1] + 1  # 0-based to 1-based
        if ref_start > var.upstream_start or ref_end < var.downstream_end:
            continue  # The read does not cover all the possible locations of the variant
        ref_aln, read_aln = getAlnCmp(read, chrom_seq[ref_start - 1:ref_end])
        expected_alt = var.alt[0].upper().replace(VCFRecord.getEmptyAlleleMarker(), "")
        expected_ref = var.ref.upper().replace(VCFRecord.getEmptyAlleleMarker(), "")
        for eval_start, eval_end in candidates:
            ref, alt = getReadRefAlt(ref_aln, read_aln, ref_start, is_insertion, eval_start, eval_end)
            read_alt = "".join(alt)
            if read_alt.upper() != expected_alt:
                continue
            read_ref = "".join(ref)
            if read_ref.upper() != expected_ref:
                continue
            # The alternative is present on the evaluated coordinates
            log.debug("{}\t{}/{}\t'{}'\t'{}'\t{}".format(read.query_name, var.ref, var.alt[0], read_ref, read_alt, read.cigarstring))
            supporting_reads.add(read.query_name)  # The fragment is kept if at least one of its reads supports the variant
            break
    return supporting_reads
Esempio n. 3
0
def mergedRecord(vcf, first, first_std_name, second, second_std_name, FH_seq):
    """
    Return the VCFRecord corresponding to the merge of first and second.

    The merged alleles are the concatenation of the alleles of first, the reference nucleotides located between the two variants and the alleles of second. The filters are the union of the filters of the two variants. The depths, allele depths and allele frequencies by sample are the minimum of the values of the two variants. The names and qualities of the parents are stored in INFO fields MCO_VAR and MCO_QUAL.

    :param vcf: The file handle to VCF.
    :type vcf: anacore.vcf.VCFIO
    :param first: The upstream variant to merge.
    :type first: anacore.vcf.VCFRecord
    :param first_std_name: The initial name of the upstream variant to merge (before normalisation).
    :type first_std_name: str
    :param second: The downstream variant to merge.
    :type second: anacore.vcf.VCFRecord
    :param second_std_name: The initial name of the downstream variant to merge (before normalisation).
    :type second_std_name: str
    :param FH_seq: File handle to the reference sequence file.
    :type FH_seq: IdxFastaIO
    :return: The variant corresponding to the merge of first and second.
    :rtype: anacore.vcf.VCFRecord
    :todo: Keep INFO and format on strand from FreeBayes, VarDict, ...
    """
    empty_allele = VCFRecord.getEmptyAlleleMarker()
    merged = VCFRecord(
        first.chrom,  # chrom
        first.pos,  # pos
        pFormat=first.format)
    # Ref and Alt
    first_end = int(round(first.refEnd() - 0.49, 0))
    second_start = int(round(second.refStart() + 0.49, 0))
    ref_add = ""
    if second_start - first_end > 1:  # At least one reference nucleotide lies between the two variants
        ref_add = FH_seq.getSub(first.chrom, first_end + 1, second_start - 1)
    merged.ref = (first.ref + ref_add + second.ref).replace(empty_allele, "")
    merged.alt = [
        (first.alt[0] + ref_add + second.alt[0]).replace(empty_allele, "")
    ]
    # Filter
    first_filters = [] if first.filter is None else first.filter
    second_filters = [] if second.filter is None else second.filter
    # Sorted to produce the same output between executions
    merged.filter = sorted(set(first_filters + second_filters))
    if len(merged.filter) > 1 and "PASS" in merged.filter:
        merged.filter.remove("PASS")
    # Samples
    for spl in first.samples:
        merged.samples[spl] = {}
        if "DP" in first.format:
            merged.samples[spl]["DP"] = min(first.getDP(spl),
                                            second.getDP(spl))
        for tag in ("AD", "AF"):
            if tag not in first.format:
                continue
            first_val = first.samples[spl][tag]
            second_val = second.samples[spl][tag]
            if vcf.format[tag].number == "1":  # Contains one alt allele
                merged.samples[spl][tag] = min(first_val, second_val)
            else:  # Contains one value by allele
                merged.samples[spl][tag] = [
                    min(first_elt, second_elt)
                    for first_elt, second_elt in zip(first_val, second_val)
                ]
    # INFO metrics
    if "AD" in first.info:
        if vcf.info["AD"].number == "1":  # Contains one alt allele
            merged.info["AD"] = merged.getPopAltAD()[0]
        elif vcf.info["AD"].number == "R":  # Contains ref and alt alleles
            merged.info["AD"] = [merged.getPopRefAD()] + merged.getPopAltAD()
        else:  # Contains only alt alleles
            merged.info["AD"] = merged.getPopAltAD()
    if "DP" in first.info:
        merged.info["DP"] = merged.getPopDP()
    if "AF" in first.info:
        if vcf.info["AF"].number == "1":  # Contains one alt allele
            merged.info["AF"] = merged.getPopAltAF()[0]
        elif vcf.info["AF"].number == "R":  # Contains ref and alt alleles
            merged.info["AF"] = [merged.getPopRefAF()] + merged.getPopAltAF()
        else:  # Contains only alt alleles
            merged.info["AF"] = merged.getPopAltAF()
    # INFO Parents: a variant already coming from a merge provides its own
    # parents, otherwise it is itself the parent
    merged.info["MCO_VAR"] = []
    for parent_record, parent_name in ((first, first_std_name), (second, second_std_name)):
        if "MCO_VAR" in parent_record.info:
            merged.info["MCO_VAR"].extend(parent_record.info["MCO_VAR"])
        else:
            merged.info["MCO_VAR"].append(parent_name)
    # Quality
    merged.info["MCO_QUAL"] = []
    for parent_record in (first, second):
        if "MCO_QUAL" in parent_record.info:
            merged.info["MCO_QUAL"].extend(parent_record.info["MCO_QUAL"])
        else:
            merged.info["MCO_QUAL"].append(parent_record.qual)
    if None not in merged.info["MCO_QUAL"]:  # The quality is set only if all the parents have a quality
        merged.qual = mean(merged.info["MCO_QUAL"])
    # Return
    return merged
Esempio n. 4
0
        '-o',
        '--output-variants',
        required=True,
        help='The path to the outputted file (format: VCF).')
    args = parser.parse_args()

    # Process
    curr_chrom = {"name": "", "seq": None}
    with VCFIO(args.output_variants, "w") as FH_out_vcf:
        with VCFIO(args.input_variants) as FH_in_vcf:
            # Header
            FH_out_vcf.copyHeader(FH_in_vcf)
            FH_out_vcf.writeHeader()
            # Records
            for record in FH_in_vcf:
                if record.ref == VCFRecord.getEmptyAlleleMarker() or any([
                        alt == VCFRecord.getEmptyAlleleMarker()
                        for alt in record.alt
                ]):  # record is a standardized in/del
                    # Get previous nt
                    if record.chrom != curr_chrom["name"]:
                        curr_chrom["name"] = record.chrom
                        curr_chrom["seq"] = getChromSeq(
                            record.chrom, args.input_reference)
                    prev_nt = curr_chrom["seq"][record.pos - 2]
                    # Update record
                    record.pos -= 1
                    if record.ref == VCFRecord.getEmptyAlleleMarker(
                    ):  # Insertion
                        record.ref = prev_nt
                    else:  # Deletion