Esempio n. 1
0
def setTranscriptsAnnotByOverlap(queries, transcripts):
    """
    Store in each query the annotation built from the transcripts overlapping it.

    For every query yielded by iterOverlappedByRegion(), the value returned by
    getTranscriptsAnnot() is written under the "ANN" key of the query's annot.
    Queries are modified in place; nothing is returned.

    :param queries: Regions to annotate.
    :type queries: list of anacore.region.Region (presumably an anacore.region.RegionList)
    :param transcripts: The list of transcripts where overlapped transcripts will be searched.
    :type transcripts: anacore.region.RegionList
    """
    # Both collections are split by reference before the overlap search.
    subjects_by_ref = splittedByRef(transcripts)
    targets_by_ref = splittedByRef(queries)
    overlaps = iterOverlappedByRegion(targets_by_ref, subjects_by_ref)
    for _, target, hits in overlaps:
        target.annot["ANN"] = getTranscriptsAnnot(target, hits)
Esempio n. 2
0
def setVariantsByOverlap(queries, variants):
    """
    Store in each query the list of variants overlapping it.

    For every query yielded by iterOverlappedByRegion(), a new list holding the
    overlapping variants is written under the "VAR" key of the query's annot.
    Queries are modified in place; nothing is returned.

    :param queries: Regions to annotate.
    :type queries: list of anacore.region.Region (presumably an anacore.region.RegionList)
    :param variants: The list of variants where overlapped variants will be searched.
    :type variants: anacore.region.RegionList
    """
    # Both collections are split by reference before the overlap search.
    subjects_by_ref = splittedByRef(variants)
    targets_by_ref = splittedByRef(queries)
    for _, target, hits in iterOverlappedByRegion(targets_by_ref, subjects_by_ref):
        # Copy into a fresh list so the annotation does not alias the iterator's container.
        target.annot["VAR"] = list(hits)
def getMergedRecords(inputs_variants, calling_sources, annotation_field,
                     shared_filters):
    """
    Merge VCFRecords coming from several variant callers.

    Inputs are processed in the given order. For each caller, breakends are
    loaded and grouped by fusion, then each fusion is compared with the fusions
    already stored from previous callers. A fusion seen for the first time is
    stored with its source information; a fusion already known is merged into
    the previously stored pair of records.

    :param inputs_variants: Pathes to the variants files.
    :type inputs_variants: list
    :param calling_sources: Names of the variants callers (in same order as inputs_variants).
    :type calling_sources: list
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param shared_filters: Filters tags applying to the variant and independent of caller like filters on annotations. These filters are not renamed to add caller ID as suffix.
    :type shared_filters: set
    :return: Merged VCF records: one tuple (first breakend record, second breakend record) by fusion.
    :rtype: list
    """
    whole_fusions = {}  # first bnd region by chromosome
    for idx_in, curr_in in enumerate(inputs_variants):
        curr_caller = calling_sources[idx_in]
        src_prefix = "s{}".format(idx_in)  # Caller specific prefix given to renameFields
        cipos_tag = "{}_CIPOS".format(src_prefix)
        log.info("Process {}".format(curr_caller))
        # breakend by id
        bnd_by_id = loadBNDByID(curr_in)
        # Group by fusion
        curr_caller_fusions = groupBNDByFusions(bnd_by_id, annotation_field)
        # Merge to other callers
        new_fusions = []
        for _, query, overlapped in iterOverlappedByRegion(
                curr_caller_fusions, whole_fusions):
            records = (query.annot["first"], query.annot["second"])
            # Extract PR and SR (read from the first breakend before fields are renamed)
            support_by_spl = {}
            for spl, data in records[0].samples.items():
                support_by_spl[spl] = {
                    "PR": getCount(data, "PR"),
                    "SR": getCount(data, "SR")
                }
            # Get identical fusion from previous callers
            prev_records = getPrevFusion(records, overlapped, curr_caller)
            # Rename fields
            for curr_record in records:
                renameFields(curr_record, src_prefix, shared_filters)
            # Add to storage
            if prev_records is None:  # Prepare new fusion
                new_fusions.append(query)
                for curr_record in records:
                    # Data source
                    curr_record.info["SRC"] = [curr_caller]
                    curr_record.info["REFSRC"] = curr_caller
                    curr_record.info["IDSRC"] = [curr_record.id]
                    # CIPOS
                    if cipos_tag in curr_record.info:
                        curr_record.info["CIPOS"] = curr_record.info[cipos_tag]
                    # Quality
                    if idx_in != 0:
                        curr_record.qual = None  # For consistency, the quality of the variant comes only from the first caller of the variant
                    # SR and PR by sample (from the first caller finding the variant: callers are in user order)
                    # Successive insert(0) calls give the final order PR, SR, PRSRC, SRSRC
                    curr_record.format.insert(0, "SRSRC")
                    curr_record.format.insert(0, "PRSRC")
                    curr_record.format.insert(0, "SR")
                    curr_record.format.insert(0, "PR")
                    for spl_name, spl_data in curr_record.samples.items():
                        spl_data["SR"] = support_by_spl[spl_name]["SR"]
                        spl_data["PR"] = support_by_spl[spl_name]["PR"]
                        spl_data["SRSRC"] = [support_by_spl[spl_name]["SR"]]
                        spl_data["PRSRC"] = [support_by_spl[spl_name]["PR"]]
            else:  # Update previous fusion
                for prev_rec, curr_rec in zip(prev_records, records):
                    prev_rec.info["SRC"].append(curr_caller)
                    prev_rec.info["IDSRC"].append(curr_rec.id)
                    # FILTERS
                    # Imprecise is taken into account only for the first caller to keep consistency with CIPOS
                    new_filters = set(curr_rec.filter) - {"Imprecise"}
                    # Union of the two sets: a boolean "or" would drop the
                    # current caller's filters whenever the previous record
                    # already has at least one filter.
                    prev_rec.filter = list(set(prev_rec.filter) | new_filters)
                    # FORMAT
                    prev_rec.format.extend(curr_rec.format)
                    # INFO
                    del curr_rec.info["MATEID"]  # The mate ID of the first caller is kept
                    if "IMPRECISE" in curr_rec.info:
                        # Imprecise is taken into account only for the first caller to keep consistency with CIPOS
                        del curr_rec.info["IMPRECISE"]
                    prev_rec.info.update(curr_rec.info)
                    # SAMPLES
                    for spl_name, spl_data in prev_rec.samples.items():
                        spl_data.update(curr_rec.samples[spl_name])
                        spl_data["SRSRC"].append(
                            support_by_spl[spl_name]["SR"])
                        spl_data["PRSRC"].append(
                            support_by_spl[spl_name]["PR"])
        # Add new fusions in whole_fusions (after the overlap scan, so the
        # container is not modified while iterOverlappedByRegion reads it)
        for curr in new_fusions:
            if curr.reference.name not in whole_fusions:
                whole_fusions[curr.reference.name] = RegionList()
            whole_fusions[curr.reference.name].append(curr)
        # Sort fusions by first breakend
        # Only values of existing keys are replaced, which is safe while iterating a snapshot of the keys
        for chrom in list(whole_fusions):
            whole_fusions[chrom] = RegionList(
                sorted(whole_fusions[chrom], key=lambda x: (x.start, x.end)))
    # Flatten fusions
    returned_fusions = []
    for fusions in whole_fusions.values():
        for fusion_region in fusions:
            returned_fusions.append(
                (fusion_region.annot["first"], fusion_region.annot["second"]))
    return returned_fusions