예제 #1
0
def changeCosmicAnnotations(record, annot_field, cosmic_reader):
    """
    Swap the COSMIC IDs reported by VEP for the position with the COSMIC IDs of the annotated allele.

    For each annotation of the record, the IDs starting with "COS" are removed from Existing_variation and replaced by the sorted IDs of the overlapping databank records whose name is the same as the annotated allele. Existing_variation is set to None when no ID remains. The annotations are modified in place and nothing is returned.

    .. warning::
        Alleles not annotated by VEP will remain unannotated despite data in databank.

    :param record: Annotated VCF record from VEP.
    :type record: anacore.vcf.VCFRecord
    :param annot_field: Field used to store annotations.
    :type annot_field: str
    :param cosmic_reader: File handler open on COSMIC databank with mode 'i'.
    :type cosmic_reader: anacore.vcf.VCFIO
    """
    # Prefix shared by the name of every allele of the record: "<chrom without CHR>:<pos>=<REF>/"
    upper_chrom = record.chrom.upper()
    if upper_chrom.startswith("CHR"):
        upper_chrom = upper_chrom[3:]
    name_prefix = "{}:{}={}/".format(upper_chrom, record.pos, record.ref.upper())
    # Databank records overlapping the reference span of the variant (the
    # query strips only a lower-case "chr" prefix and keeps the original case)
    query_chrom = record.chrom
    if query_chrom.startswith("chr"):
        query_chrom = query_chrom[3:]
    cosmic_overlaps = list(
        cosmic_reader.getSub(query_chrom, int(record.refStart()),
                             int(record.refEnd() + 0.5)))
    # Rewrite the known variants of each annotation
    for annot in record.info[annot_field]:
        expected_name = name_prefix + annot["Allele"].upper()
        # Keep the IDs coming from other databanks
        previous_ids = annot["Existing_variation"]
        kept_ids = []
        if previous_ids is not None:
            kept_ids = [
                elt for elt in previous_ids.split("&")
                if not elt.startswith("COS")
            ]
        # Collect the COSMIC IDs of the records describing exactly this allele
        cosmic_ids = set()
        for db_record in cosmic_overlaps:
            if len(db_record.alt) == 1:
                db_names = [db_record.getName()]
            else:  # One name by alternative allele of the databank record
                db_names = [
                    getAlleleRecord(cosmic_reader, db_record, alt_idx).getName()
                    for alt_idx in range(len(db_record.alt))
                ]
            if any(expected_name == name.upper() for name in db_names):
                cosmic_ids.update(db_record.id.split(";"))
        kept_ids.extend(sorted(cosmic_ids))
        # Store the result (None stands for "no known variant")
        annot["Existing_variation"] = "&".join(kept_ids) if kept_ids else None
예제 #2
0
def normAndMove(genome_path, in_variant_file, out_variant_file,
                trace_unstandard):
    """
    Write the normalized and left-moved version of each variant in a new file. The normalization consists of three steps:
      1- The variants with multiple alternative alleles are split in one record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and alternative and reference allele are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).
      3- The allele is replaced by the most upstream allele that can have the same alternative sequence (example: a deletion in homopolymer is moved to first nucleotide of this homopolymer).

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    sequences = getSeqByChr(genome_path)
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header: same as input plus the optional trace field
        writer.copyHeader(reader)
        if trace_unstandard:
            writer.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description="The variant id (chromosome:position=reference/alternative) before standardization."
            )
        writer.writeHeader()
        # Records: one output record by alternative allele
        for record in reader:
            chrom_seq = sequences[record.chrom]
            for alt_idx in range(len(record.alt)):
                allele = getAlleleRecord(reader, record, alt_idx)
                if trace_unstandard:
                    # Name of the allele before it is moved
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref,
                        "/".join(allele.alt))
                writer.write(allele.getMostUpstream(chrom_seq))
예제 #3
0
def normOnly(in_variant_file, out_variant_file, trace_unstandard):
    """
    Write the normalized version of each variant in a new file. The normalization consists of two steps:
      1- The variants with multiple alternative alleles are split in one record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and alternative and reference allele are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).

    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header: same as input plus the optional trace field
        writer.copyHeader(reader)
        if trace_unstandard:
            writer.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description="The variant id (chromosome:position=reference/alternative) before standardization."
            )
        writer.writeHeader()
        # Records: one output record by alternative allele
        for record in reader:
            for alt_idx in range(len(record.alt)):
                allele = getAlleleRecord(reader, record, alt_idx)
                if trace_unstandard:
                    # The trace is taken before the in-place normalization
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref,
                        "/".join(allele.alt))
                allele.normalizeSingleAllele()
                writer.write(allele)
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return by chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).

    A breakend with several alternatives is split in one record by mate and the ID of each split record is suffixed by "_" and the index of the mate. In a fusion the breakend tagged "first" is the one with RNA_FIRST in INFO. When the caller contains several entries with the same pair of breakend names, their annotations are concatenated in the first entry met.

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).
    :rtype: dict
    :raises Exception: If none of the two breakends of a fusion contains the tag RNA_FIRST.
    """
    caller_fusions = dict()
    processed_fusions = set()
    fusion_by_name = {}
    for record in bnd_by_id.values():
        for alt_idx in range(len(record.alt)):
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:
                first_new_id += "_" + str(
                    alt_idx)  # Record must be split for each mate
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                # list.index() returns an int: it must be converted before
                # the concatenation with the ID
                second_new_id += "_" + str(
                    first_idx)  # Record must be split for each mate
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            # The fusion ID is built from the IDs before renaming
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id]))
            # NOTE(review): when a record is not split, the renaming below is
            # applied on the object stored in bnd_by_id, so its MATEID can
            # point to a suffixed ID that is not a key of bnd_by_id - confirm
            # that inputs with several mates are handled as expected.
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id not in processed_fusions:  # Each pair is seen from its two breakends
                processed_fusions.add(fusion_id)
                if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                    raise Exception(
                        "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                        .format(alt_first_bnd.id, mate_id))
                if "RNA_FIRST" in alt_second_bnd.info:
                    # The breakend with RNA_FIRST becomes the first
                    alt_first_bnd, alt_second_bnd = alt_second_bnd, alt_first_bnd
                interval_first_bnd = getBNDInterval(alt_first_bnd)
                fusion_name = " @@ ".join(
                    sorted([alt_first_bnd.getName(),
                            alt_second_bnd.getName()]))
                if fusion_name not in fusion_by_name:
                    region_first_bnd = Region(interval_first_bnd[0],
                                              interval_first_bnd[1],
                                              reference=alt_first_bnd.chrom,
                                              annot={
                                                  "first": alt_first_bnd,
                                                  "second": alt_second_bnd
                                              })
                    if alt_first_bnd.chrom not in caller_fusions:
                        caller_fusions[alt_first_bnd.chrom] = RegionList()
                    caller_fusions[alt_first_bnd.chrom].append(
                        region_first_bnd)
                    fusion_by_name[fusion_name] = region_first_bnd
                else:  # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                    fusion_by_name[fusion_name].annot["first"].info[
                        annotation_field] += alt_first_bnd.info[
                            annotation_field]
                    fusion_by_name[fusion_name].annot["second"].info[
                        annotation_field] += alt_second_bnd.info[
                            annotation_field]
    return caller_fusions
예제 #5
0
 primers_by_chr = getPrimersByChr(args.input_regions)
 chr_seq = None
 with VCFIO(args.input_variants) as FH_in:
     with VCFIO(args.output_variants, "w") as FH_out:
         # Header
         FH_out.copyHeader(FH_in)
         FH_out.filter["PRIM"] = HeaderFilterAttr(
             'PRIM',
             'The variant is located on an amplicon primer (amplicon desgin: '
             + args.input_regions + ').')
         FH_out.writeHeader()
         # Records
         for record in FH_in:
             for alt_idx, alt in enumerate(record.alt):
                 is_kept = True
                 alt_record = getAlleleRecord(FH_in, record, alt_idx)
                 alt_region = getVariantRegion(alt_record)
                 overlapped_primers = primers_by_chr[
                     alt_record.chrom].getOverlapped(alt_region)
                 if len(
                         overlapped_primers
                 ) > 0:  # The variant overlaps a primer or variant is an insertion just before the downstream primer
                     is_kept = False
                     if len(overlapped_primers
                            ) == 1:  # Variants over 2 primers are removed
                         if alt_record.isIndel():
                             if chr_seq is None or chr_seq.id != alt_record.chrom:
                                 chr_seq = getSeqRecord(
                                     args.input_sequences, alt_record.chrom)
                             if canBeMovedToInterest(
                                     overlapped_primers[0], chr_seq.string,
예제 #6
0
def addVCFVariants(variants, vcf_path, vcf_idx, spl_name=None):
    """
    Add the variants of one VCF and their allele frequencies in a dict shared between VCF.

    :param variants: By unique ID the variants. The content of this variable is set by the call of this function.
                     Content example:
                     {
                       "chr1:10=A/T":{
                         "chrom":"chr1",
                         "pos":10,
                         "ref":"A",
                         "alt":"T",
                         "freq":[0.2, 0.5] },
                       "chr1:10=A/G":{
                         "chrom":"chr1",
                         "pos":10,
                         "ref":"A",
                         "alt":"G",
                         "freq":[0.01, 0] },
                       "chr3:20=G/T":{
                         "chrom":"chr3",
                         "pos":20,
                         "ref":"G",
                         "alt":"T",
                         "freq":[0, 0.4] }
                     }
                     The list of frequencies is extended by each call of the function with a different vcf_idx. The variants missing in a VCF have a frequency of 0 at the index of this VCF.
    :type variants: dict
    :param vcf_path: Path to the VCF file to add.
    :type vcf_path: str
    :param vcf_idx: Index used to store the frequency of each variant of the VCF in frequencies list (start from 0).
    :type vcf_idx: int
    :param spl_name: The frequency of the variants came from this sample. When it is None the first sample of the VCF is used.
    :type spl_name: str
    """
    expected_len = vcf_idx + 1  # Length of each frequencies list after this call
    with VCFIO(vcf_path) as FH_vcf:
        if spl_name is None:
            spl_name = FH_vcf.samples[0]
        for record in FH_vcf:
            alt_frequencies = record.getAltAF(spl_name)
            # One entry by alternative allele
            for idx_alt in range(len(record.alt)):
                allele = getAlleleRecord(FH_vcf, record, idx_alt)
                allele.normalizeSingleAllele()
                allele_name = allele.getName()
                if allele_name not in variants:
                    variants[allele_name] = {
                        "chrom": allele.chrom,
                        "pos": allele.pos,
                        "ref": allele.ref,
                        "alt": allele.alt[0],
                        "freq": list()
                    }
                frequencies = variants[allele_name]["freq"]
                # Pad with 0 the slots of the previous VCF where the variant was missing
                frequencies.extend([0] * (expected_len - len(frequencies)))
                # Store the frequency of the allele in the current VCF
                frequencies[vcf_idx] = alt_frequencies[idx_alt]
    # Pad with 0 the variants missing in the current VCF
    for variant in variants.values():
        frequencies = variant["freq"]
        frequencies.extend([0] * (expected_len - len(frequencies)))
예제 #7
0
    # Process: tag the alleles whose frequency does not exceed their noise
    # threshold in any sample
    nb_variants = 0  # Number of alleles evaluated
    nb_filtered = 0  # Number of alleles tagged with args.tag_name
    # Mapping keyed by allele name (values are compared to the AF below)
    noise_by_variant = getNoise(args.input_noises)
    with VCFIO(args.input_variants) as FH_in:
        with VCFIO(args.output_variants, "w") as FH_out:
            # Header: same as input plus the declaration of the new filter
            FH_out.copyHeader(FH_in)
            FH_out.filter[args.tag_name] = HeaderFilterAttr(args.tag_name, args.tag_description)
            FH_out.writeHeader()
            # Records: each alternative allele is evaluated separately
            for record in FH_in:
                for idx in range(len(record.alt)):
                    nb_variants += 1
                    curr_allele = getAlleleRecord(FH_in, record, idx)
                    # Compare signal to noise (only for the alleles with a noise entry)
                    if curr_allele.getName() in noise_by_variant:
                        # Count the samples where the AF is strictly over the noise
                        nb_spl_over_noise = 0
                        for curr_spl in curr_allele.samples:
                            if curr_allele.getAltAF(curr_spl)[0] > noise_by_variant[curr_allele.getName()]:
                                nb_spl_over_noise += 1
                        if nb_spl_over_noise == 0:  # No sample is over the noise
                            nb_filtered += 1
                            if curr_allele.filter is None or len(curr_allele.filter) == 0:
                                curr_allele.filter = [args.tag_name]
                            else:
                                # PASS cannot be kept with a filter tag
                                if "PASS" in curr_allele.filter:
                                    curr_allele.filter.remove("PASS")
                                curr_allele.filter.append(args.tag_name)
                    # Update empty filter
예제 #8
0
def stdizeVCF(FH_ref, FH_in, FH_out, trace_unstandard=False, log=None):
    """
    Write the standardized version of each variant: the alternative alleles are split in one record by allele, the unnecessary reference and alternative nucleotides are removed, the indels are moved to the most upstream position and the alternative allele is updated in annotations.

    When FH_out is an AnnotVCFIO, only the annotations with the same allele as the record before its standardization are kept. The annotations on other alleles (collocated variants) are removed and counted.

    :param FH_ref: File handle to the reference file (format: fasta with faidx).
    :type FH_ref: anacore.sequenceIO.IdxFastaIO
    :param FH_in: File handle to the variants file (format: VCF).
    :type FH_in: anacore.vcf.VCFIO
    :param FH_out: File handle to the standardized variants file (format: VCF).
    :type FH_out: anacore.vcf.VCFIO
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    :param log: Logger used to warn when annotations have been removed.
    :type log: logging.Logger
    """
    annot_counts = {"exact": 0, "collocated": 0}
    with_annotations = isinstance(FH_out, AnnotVCFIO)
    # Header: same as input plus the optional trace field
    FH_out.copyHeader(FH_in)
    if trace_unstandard:
        FH_out.info["UNSTD"] = HeaderInfoAttr(
            "UNSTD",
            type="String",
            number="1",
            description="The variant id (chromosome:position=reference/alternative) before standardization."
        )
    FH_out.writeHeader()
    # Records
    for record in FH_in:
        std_alleles = []
        for alt_idx in range(len(record.alt)):
            allele = getAlleleRecord(FH_in, record, alt_idx)
            if trace_unstandard:
                allele.info["UNSTD"] = allele.getName()
            # The alternative allele before standardization is the one used in annotations
            alt_before = allele.alt[0]
            # Standardize pos, ref and alt
            allele.fastStandardize(FH_ref, 1000)
            # Keep the annotations of this allele and rename their allele
            if with_annotations and FH_in.annot_field in allele.info:
                kept_annot = []
                for annot in allele.info[FH_in.annot_field]:
                    if alt_before != annot["Allele"]:
                        annot_counts["collocated"] += 1
                        continue
                    annot_counts["exact"] += 1
                    annot["Allele"] = allele.alt[0]
                    kept_annot.append(annot)
                allele.info[FH_in.annot_field] = kept_annot
            std_alleles.append(allele)
        # The alleles coming from the same record are written by position
        if len(std_alleles) > 1:
            std_alleles.sort(key=lambda elt: (elt.refStart(), elt.refEnd()))
        for allele in std_alleles:
            FH_out.write(allele)
    if log is not None and annot_counts["collocated"] != 0:
        nb_total = annot_counts["exact"] + annot_counts["collocated"]
        log.warning(
            "{}/{} annotations have been deleted because they concern collocated variant."
            .format(annot_counts["collocated"], nb_total))
예제 #9
0
    # Load the alleles of every input VCF in `variants` (keyed by allele name)
    # with AF, AD and DP stored by sample
    for vcf_idx, current_vcf in enumerate(args.input_variants):
        # The alignment file at the same index as the VCF is associated to
        # its samples, unless the completion is deactivated
        current_aln = None
        if not args.deactivate_completion:
            current_aln = args.input_aln[vcf_idx]
        with VCFIO(current_vcf) as FH_vcf:
            # Manage samples
            for curr_spl in FH_vcf.samples:  # For each sample in VCF
                aln_by_samples[curr_spl] = current_aln
            # Manage records
            for record in FH_vcf:  # For each variant
                # Without selected region all the records are processed
                if args.selected_region is None or record.chrom == args.selected_region:
                    for curr_spl in FH_vcf.samples:  # For each sample in VCF
                        vcaller_AF = record.getAltAF(curr_spl)
                        vcaller_DP = record.getDP(curr_spl)
                        for alt_idx, curr_alt in enumerate(record.alt):  # For each alternative allele in variant
                            record_allele = getAlleleRecord(FH_vcf, record, alt_idx)
                            # Get allele frequency from the variant caller
                            vcaller_curr_AF = vcaller_AF[alt_idx]
                            if len(vcaller_AF) == len(record.alt) + 1:  # The AF contains reference AF
                                vcaller_curr_AF = vcaller_AF[alt_idx + 1]
                            record_allele.samples[curr_spl]["AF"] = [round(vcaller_curr_AF, args.AF_precision)]
                            # AD is computed from the unrounded AF and truncated by int()
                            record_allele.samples[curr_spl]["AD"] = [int(vcaller_curr_AF * vcaller_DP)]
                            record_allele.samples[curr_spl]["DP"] = vcaller_DP
                            # Store allele
                            allele_id = record_allele.getName()
                            if allele_id not in variants:
                                variants[allele_id] = record_allele
                            else:
                                # Allele already seen: only the data of the current sample are added
                                variants[allele_id].samples[curr_spl] = record_allele.samples[curr_spl]

    # Completes and writes variants