Example #1
def getTargets(in_aln, in_targets=None):
    """
    Return the list of targeted regions.

    :param in_aln: Path to the alignment file (format: SAM/BAM).
    :type in_aln: str
    :param in_targets: Path to the targeted regions (format: BED). They must not contain any overlaps.
    :type in_targets: str
    :return: List of targeted regions.
    :rtype: anacore.region.RegionList
    """
    selected_regions = RegionList()
    if in_targets is None:
        with pysam.AlignmentFile(in_aln, "rb") as FH_bam:
            for ref_info in FH_bam.header["SQ"]:
                selected_regions.append(
                    Region(1, ref_info["LN"], "+", ref_info["SN"], ref_info["SN"])
                )
    else:
        selected_regions = getAreas(in_targets)
        # Check lack of overlap
        selected_regions = sorted(selected_regions, key=lambda x: (x.reference.name, x.start, x.end))
        prev_region = selected_regions[0]
        for curr_region in selected_regions[1:]:
            if curr_region.reference.name == prev_region.reference.name:
                if prev_region.end >= curr_region.start:
                    raise Exception("The regions {} and {} contains an overlap.".format(prev_region, curr_region))
            prev_region = curr_region
    return selected_regions
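A minimal usage sketch (the file paths are placeholders; it assumes getTargets and its anacore/pysam dependencies are importable):

# Hypothetical usage: paths are placeholders.
whole_reference_targets = getTargets("sample.bam")     # one region per reference in the BAM header
panel_targets = getTargets("sample.bam", "panel.bed")  # regions read from a non-overlapping BED
for curr_target in panel_targets:
    print(curr_target.reference.name, curr_target.start, curr_target.end)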
Example #2
def shallowFromAlignment(aln_path, selected_regions, depth_mode, min_depth, log):
    """
    Return the list of shallow regions from the alignment file.

    :param aln_path: Path to the alignment file (format: SAM/BAM).
    :type aln_path: str
    :param selected_regions: Targeted regions. They must not contain any overlaps.
    :type selected_regions: anacore.region.RegionList
    :param depth_mode: How to count the depth: by reads (each read is counted independently) or by fragments (the R1 and R2 coming from the same pair are counted only once).
    :type depth_mode: str
    :param min_depth: All the locations with a depth below this value are reported as shallow areas.
    :type min_depth: int
    :param log: Logger of the script.
    :type log: logging.Logger
    :return: List of shallow regions.
    :rtype: anacore.region.RegionList
    """
    shallow = RegionList()
    nb_selected_regions = len(selected_regions)
    idx_in_part = 1
    with pysam.AlignmentFile(aln_path, "rb") as FH_bam:
        for idx_region, region in enumerate(selected_regions):
            if idx_in_part > nb_selected_regions / 10:
                idx_in_part = 0
                log.info("Processed regions {}/{}.".format(idx_region + 1, nb_selected_regions))
            idx_in_part += 1
            prev_opened = {"start": None, "end": None}
            curr_checked = region.start - 1
            for pileupcolumn in FH_bam.pileup(region.reference.name, region.start - 1, region.end - 1, max_depth=100000000):
                if pileupcolumn.reference_pos + 1 >= region.start and pileupcolumn.reference_pos + 1 <= region.end:
                    # Missing positions
                    while curr_checked < pileupcolumn.reference_pos:
                        addToShallow(region.reference, curr_checked, prev_opened, shallow)
                        curr_checked += 1
                    # Current position
                    curr_reads_depth = 0
                    curr_frag = set()
                    for pileupread in pileupcolumn.pileups:
                        if pileupcolumn.reference_pos + 1 < region.start or pileupcolumn.reference_pos + 1 > region.end:
                            raise Exception("The reference position {}:{} is out of target {}.".format(region.reference.name, pileupcolumn.reference_pos, region))
                        if not pileupread.alignment.is_secondary and not pileupread.alignment.is_duplicate and not pileupread.is_refskip:
                            curr_reads_depth += 1
                            curr_frag.add(pileupread.alignment.query_name)
                    curr_depth = curr_reads_depth
                    if depth_mode == "fragment":
                        curr_depth = len(curr_frag)
                    if min_depth > curr_depth:
                        addToShallow(region.reference, pileupcolumn.reference_pos, prev_opened, shallow)
                    curr_checked = pileupcolumn.reference_pos + 1
            # Missing positions
            while curr_checked < region.end:
                addToShallow(region.reference, curr_checked, prev_opened, shallow)
                curr_checked += 1
            if prev_opened["start"] is not None:
                shallow.append(
                    Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", region.reference)
                )
    return shallow
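A minimal usage sketch, assuming the BAM below exists with its index and that getTargets from the previous example is available:

import logging

# Hypothetical usage: the BAM/BED paths are placeholders and the BAM is assumed to be indexed.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("shallow_analysis")
targets = getTargets("sample.bam", "panel.bed")  # non-overlapping targets, as required
shallow = shallowFromAlignment("sample.bam", targets, "fragment", 30, log)
for curr_region in shallow:
    print("Shallow:", curr_region.reference.name, curr_region.start, curr_region.end)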
Example #3
def getAreas(in_bed):
    """
    @summary: Returns the list of areas from a BED file.
    @param in_bed: [str] The path to the areas description (format: BED).
    @returns: [RegionList] The list of areas.
    """
    areas = RegionList()
    with BEDIO(in_bed) as FH_panel:
        areas = RegionList(FH_panel.read())
    return areas
Example #4
def getAreas(in_bed):
    """
    Return the list of areas from a BED file.

    :param in_bed: The path to the areas description (format: BED).
    :type in_bed: str
    :return: The list of areas.
    :rtype: region.RegionList
    """
    areas = RegionList()
    with BEDIO(in_bed) as FH_panel:
        areas = RegionList(FH_panel.read())
    return areas
Example #5
def getPrimersByChr(in_regions):
    """
    @summary: Returns the list of primers by chromosome.
    @param in_regions: [str] Path to the amplicons design with their primers (format: BED). The zone of interest is defined by thickStart and thickEnd.
    @return: [dict] By chromosome, an instance of RegionList containing the primers. Each primer has a location annotation: upstream or downstream (this information is based on the + strand).
    """
    primers_by_chr = dict()
    with BEDIO(in_regions) as FH_in:
        for record in FH_in:
            if record.chrom not in primers_by_chr:
                primers_by_chr[record.chrom] = RegionList()
            if record.thickStart is None or record.thickEnd is None:
                raise Exception(
                    'The BED file "' + in_regions +
                    '" does not contains thickStart and thickEnd for all the amplicons.'
                )
            upstream_primer = Region(record.start, record.thickStart - 1,
                                     record.strand, record.reference, None,
                                     {"location": "upstream"})
            primers_by_chr[record.chrom].append(upstream_primer)
            downstream_primer = Region(record.thickEnd + 1, record.end,
                                       record.strand, record.reference, None,
                                       {"location": "downstream"})
            primers_by_chr[record.chrom].append(downstream_primer)
    return primers_by_chr
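To make the thickStart/thickEnd convention concrete, here is a standalone sketch of the coordinate arithmetic performed above, with invented values:

# Invented example values: an amplicon spanning 100-200 whose zone of interest is 120-180.
start, end = 100, 200
thick_start, thick_end = 120, 180
upstream_primer = (start, thick_start - 1)   # (100, 119): bases before the zone of interest
downstream_primer = (thick_end + 1, end)     # (181, 200): bases after the zone of interest
print(upstream_primer, downstream_primer)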
Example #6
    def testShallowFromAlignment(self):
        """
        art_chr1:
                10        20        30        40        50        60        70        80        90       100       110       120
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|12345678
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT
                   *******.************************************************** ******************************************.*********
                TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA
                                                                              ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG
                 ------------------------------------------------------------------------------------------------------------------

        art_chr2:
                10        20        30        40        50        60        70        80        90       100       110       120
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|12345678
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT
               ********************************************        ***************************************************
               AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAA        CATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC
                 -------------------------------------------------------------------------------------------

        art_chr3:
                10        20        30        40        50        60        70
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|12
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT

                 -------------------------------
        """
        with open(self.tmp_sam, "w") as writer:
            writer.write("""@SQ	SN:art_chr1	LN:128
@SQ	SN:art_chr2	LN:128
@SQ	SN:art_chr3	LN:72
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fasta reads.fasta
read_1	0	art_chr1	12	60	3S58M	*	0	0	TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA	*	NM:i:1	MD:Z:7G50	AS:i:53	XS:i:0
read_2	0	art_chr1	71	60	52M	*	0	0	ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG	*	NM:i:1	MD:Z:42G9	AS:i:47	XS:i:0
read_3	0	art_chr2	8	60	44M8D51M	*	0	0	AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAACATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC	*	NM:i:8	MD:Z:44^TACTAAAT51	AS:i:81	XS:i:0
""")
        samToBam(self.tmp_sam, self.tmp_bam)
        pysam.index(self.tmp_bam)

        class FakeLogger:
            def info(self, msg):
                pass

        selected_regions = RegionList([
            Region(10, 123, None, "art_chr1"),
            Region(10, 100, None, "art_chr2"),
            Region(10, 40, None, "art_chr3"),
        ])

        expected = [
            "art_chr1:10-11",
            "art_chr1:70-70",
            "art_chr1:123-123",
            "art_chr3:10-40",
        ]
        observed = [
            str(elt) for elt in shallowFromAlignment(
                self.tmp_bam, selected_regions, "reads", 1, FakeLogger())
        ]
        self.assertEqual(sorted(expected), sorted(observed))
Example #7
def getAreasByChr(in_bed):
    """
    @summary: Returns the list of areas by chromosome from a BED file.
    @param in_bed: [str] The path to the areas description (format: BED).
    @returns: [dict] The list of areas by chromosome (each list is an instance of RegionList).
    """
    areas_by_chr = dict()
    for curr_area in getAreas(in_bed):
        chrom = curr_area.reference.name
        if chrom not in areas_by_chr:
            areas_by_chr[chrom] = RegionList()
        areas_by_chr[chrom].append(curr_area)
    return areas_by_chr
Example #8
def getSortedAreasByChr(in_bed):
    """
    Return by chromosome the list of sorted areas from a BED file.

    :param in_bed: The path to the areas description (format: BED).
    :type in_bed: str
    :return: The list of sorted areas by chromosome (each list is an instance of region.RegionList).
    :rtype: dict
    """
    areas_by_chr = {}
    for chrom, areas in getAreasByChr(in_bed).items():
        areas_by_chr[chrom] = RegionList(
            sorted(areas, key=lambda x: (x.start, x.end)))
    return areas_by_chr
Example #9
 def testSplittedByRef(self):
     reg_list = RegionList([
         Region(10, 30, "-", "chr1", "region1"),
         Region(40, 70, "-", "chr1", "region2"),
         Region(80, 100, "-", "chr2", "region3")
     ])
     reg_by_chr = splittedByRef(reg_list)
     expected = ["chr1:region1", "chr1:region2", "chr2:region3"]
     observed = []
     for chrom, regions in sorted(reg_by_chr.items()):
         named_regions = []
         for curr_region in regions:
             named_regions.append("{}:{}".format(chrom, curr_region.name))
         observed.extend(named_regions)
     self.assertEqual(expected, observed)
Example #10
    def proteins(self, proteins):
        """
        Change the proteins linked to the transcript. Before adding the new proteins, the old ones are unlinked (prot.transcript is set to None).

        :param proteins: The new linked proteins.
        :type proteins: list
        """
        # Remove previous proteins
        for prot in self.proteins:
            prot.transcript = None  # Remove link in protein
        self._proteins = RegionList()  # Remove link in transcript
        # Add new proteins
        if proteins is not None:
            for curr_prot in proteins:
                curr_prot.transcript = self
Example #11
def getAreasByChr(in_bed):
    """
    Return by chromosome the list of areas from a BED file.

    :param in_bed: The path to the areas description (format: BED).
    :type in_bed: str
    :return: The list of areas by chromosome (each list is an instance of region.RegionList).
    :rtype: dict
    """
    areas_by_chr = dict()
    for curr_area in getAreas(in_bed):
        chrom = curr_area.reference.name
        if chrom not in areas_by_chr:
            areas_by_chr[chrom] = RegionList()
        areas_by_chr[chrom].append(curr_area)
    return areas_by_chr
Example #12
def filteredByOverlap(targets_by_chr, selected_genes):
    """
    Return targeted areas overlapping selected genes.

    :param targets_by_chr: RegionList of targets by chromosome.
    :type targets_by_chr: dict
    :param selected_genes: Translocated genes. Each element is a Gene object from genomic annotations.
    :type selected_genes: list
    :return: Targeted areas overlapping selected genes.
    :rtype: dict
    """
    # Genes by chromosome
    genes_by_chr = dict()
    for gene in selected_genes:
        chrom = gene.reference.name
        if chrom in genes_by_chr:
            genes_by_chr[chrom].append(gene)
        else:
            genes_by_chr[chrom] = RegionList([gene])
    # Find overlaps between targets and selected genes
    trimmed_targets_by_chr = dict()
    for chrom, genes in genes_by_chr.items():
        overlaps = list()
        for gene, targets in iterOverlapped(genes_by_chr[chrom],
                                            targets_by_chr[chrom]):
            for curr in targets:
                overlaps.append(
                    [max(gene.start, curr.start),
                     min(gene.end, curr.end)])
        consolidated_overlaps = list()
        if len(overlaps) > 0:
            overlaps = sorted(overlaps, key=lambda x: (x[0], x[1]))
            consolidated_overlaps = [overlaps[0]]
            prev = overlaps[0]
            for curr in overlaps[1:]:
                if curr[0] > prev[1]:
                    consolidated_overlaps.append(curr)
                    prev = curr
                else:
                    prev[1] = max(curr[1], prev[1])
        trimmed_targets_by_chr[chrom] = consolidated_overlaps
    return trimmed_targets_by_chr
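The consolidation step above is a standard merge of sorted intervals; this standalone sketch reproduces that logic on invented coordinates:

# Standalone sketch of the overlap-consolidation loop above (coordinates are invented).
overlaps = [[10, 30], [25, 40], [50, 60], [60, 70], [80, 90]]
overlaps = sorted(overlaps, key=lambda x: (x[0], x[1]))
consolidated = [overlaps[0]]
prev = overlaps[0]
for curr in overlaps[1:]:
    if curr[0] > prev[1]:        # disjoint: start a new consolidated interval
        consolidated.append(curr)
        prev = curr
    else:                        # overlapping: extend the previous interval
        prev[1] = max(curr[1], prev[1])
print(consolidated)  # [[10, 40], [50, 70], [80, 90]]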
Example #13
 def testConsolidate(self):
     reg_list = RegionList([
         Region(5, 9, "-", "chr1", "region1"),
         Region(10, 30, "-", "chr1", "region2"),
         Region(30, 40, "-", "chr1", "region3"),
         Region(35, 39, "-", "chr1", "region4"),
         Region(40, 70, "-", "chr1", "region5"),
         Region(71, 90, "-", "chr1", "region6"),
         Region(92, 100, "-", "chr1", "region7"),
         Region(100, 100, "+", "chr1", "region8"),
         Region(80, 100, "-", "chr2", "region9")
     ])
     # Merge overlapping
     consolidated_reg = consolidated(reg_list, False)
     expected = ["chr1:5-9[-]", "chr1:10-70[-]", "chr1:71-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]
     observed = [curr.getCoordinatesStr() for curr in consolidated_reg]
     self.assertEqual(expected, observed)
     # Merge overlapping and contiguous
     consolidated_reg = consolidated(reg_list, True)
     expected = ["chr1:5-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]
     observed = [curr.getCoordinatesStr() for curr in consolidated_reg]
     self.assertEqual(expected, observed)
Example #14
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the regions corresponding to the known variants in a VCF.

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of database samples in which the variant must be known for its information to be used.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database samples with this variant.
    :type count: str
    :return: List of variants regions.
    :rtype: anacore.region.RegionList
    """
    variants_region = None
    with VCFIO(vcf_path) as FH_in:
        variants_region = [
            Region(
                record.pos,
                record.pos + len(record.ref),
                None,
                record.chrom,
                record.id,
                {
                    "id": record.id,
                    "gene": ("" if symbol not in record.info else record.info[symbol]),
                    "HGVSp": ("" if hgvsp not in record.info else record.info[hgvsp]),
                    "HGVSc": ("" if hgvsc not in record.info else record.info[hgvsc]),
                    "count": (None if count not in record.info else int(record.info[count]))
                }
            ) for record in FH_in if (symbol not in record.info or "_ENST" not in record.info[symbol]) and (count not in record.info or int(record.info[count]) >= min_count)
        ]
    return RegionList(variants_region)
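A minimal usage sketch; the VCF path is a placeholder and the default INFO tags (GENE, CDS, AA, CNT) are assumed to be present in the file:

# Hypothetical usage: the VCF path is a placeholder; INFO tags GENE/CDS/AA/CNT are assumed present.
known_regions = variantsRegionFromVCF("known_variants.vcf", min_count=5)
for curr_variant in known_regions:
    print(curr_variant.reference.name, curr_variant.start, curr_variant.annot["gene"], curr_variant.annot["count"])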
Example #15
    def __init__(self,
                 start=None,
                 end=None,
                 strand=None,
                 reference=None,
                 name=None,
                 annot=None,
                 parent=None,
                 children=None,
                 proteins=None):
        """
        Build and return an instance of Transcript.

        :param start: The start position on the reference. This position is 1-based and ascending (start <= end).
        :type start: int
        :param end: The end position on the reference. This position is 1-based and ascending (start <= end). [Default: start]
        :type end: int
        :param strand: The strand of the instance ("+" or "-").
        :type strand: str
        :param reference: The region object or the region name of the reference.
        :type reference: region.Region | str
        :param name: The name of the region.
        :type name: str
        :param annot: The annotations of the region.
        :type annot: dict
        :param parent: The gene.
        :type parent: region.RegionTree
        :param children: The list of exons.
        :type children: region.RegionList
        :param proteins: The list of proteins produced from the transcript. Several proteins can exist for one transcript (operon, readthrough).
        :type proteins: region.RegionList
        :return: The new instance.
        :rtype: genomicRegion.Transcript
        """
        RegionTree.__init__(self, start, end, strand, reference, name, annot,
                            parent, children)
        self._proteins = RegionList()
        self.proteins = proteins
Example #16
def getTranscriptAnnot(in_annot, gene_by_tr):
    """
    Get genomic model (genes, transcripts and exons) for the selected transcripts.

    :param in_annot: Path to the genomic annotations (format: GFF3).
    :type in_annot: str
    :param gene_by_tr: Gene by selected transcripts.
    :type gene_by_tr: dict
    :return: The list of selected transcripts.
    :rtype: anacore.region.RegionList
    """
    tr_by_id = dict()
    with GFF3IO(in_annot) as FH_annot:
        for record in FH_annot:
            if record.type == "mRNA" and "transcript_id" in record.annot:
                tr_id = record.annot["transcript_id"]
                tr_id = tr_id.split(".")[0]  # Remove transcript version
                if tr_id in gene_by_tr:  # Transcript is in panel
                    if tr_id not in tr_by_id:
                        tr_by_id[tr_id] = Transcript(record.start, record.end,
                                                     record.strand,
                                                     record.seq_id, tr_id, {},
                                                     gene_by_tr[tr_id])
            if record.type == "exon" and "transcript_id" in record.annot:
                tr_id = record.annot["transcript_id"]
                tr_id = tr_id.split(".")[0]  # Remove transcript version
                if tr_id in gene_by_tr:  # Transcript is in panel
                    # Store the exon
                    tr_by_id[tr_id].addChild(
                        Exon(record.start, record.end, record.strand,
                             record.seq_id))
    if len(gene_by_tr) != len(tr_by_id):
        raise Exception(
            "The following transcripts are missing in {}: {}".format(
                in_annot,
                set(gene_by_tr.keys()).difference(set(tr_by_id.keys()))))
    return RegionList(tr_by_id.values())
Example #17
    def getCDSFromTranscript(self):
        """
        Return the CDS of the protein from the transcript and its exons. This function is used when the CDS are not defined in the protein but the exons and the protein start and end are defined.

        :return: The list of CDS of the protein in protein strand order.
        :rtype: region.RegionList
        """
        # Check information completion
        if self.transcript is None:
            raise Exception(
                "A link with a transcript is required to return CDS for {}.".
                format(self))
        if self.start is None or self.end is None:
            raise Exception(
                "Start and end for {} are required to return CDS from transcript {}."
                .format(self, self.transcript))
        # Exons to CDS
        exons = sorted(self.transcript.children,
                       key=lambda x: (x.start, x.end))
        nb_exons = len(exons)
        idx_exon = 0
        curr_exon = exons[idx_exon]
        while self.start > curr_exon.end:
            idx_exon += 1
            curr_exon = exons[idx_exon]
        cds = RegionList()
        while curr_exon is not None and self.end >= curr_exon.start:
            cds_start = max(self.start, curr_exon.start)
            cds_end = min(self.end, curr_exon.end)
            cds.append(CDS(cds_start, cds_end, self.strand, self.reference))
            idx_exon += 1
            curr_exon = None
            if idx_exon < nb_exons:
                curr_exon = exons[idx_exon]
        # Sort by strand order
        if self.strand == "-":
            cds = RegionList(
                sorted(cds, key=lambda x: (x.end, x.start), reverse=True))
        # Return
        return cds
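The exon clipping above can be illustrated with plain tuples; this sketch uses invented coordinates and skips the final strand reordering:

# Standalone sketch of the exon-to-CDS clipping above (coordinates are invented, strand ignored).
exons = [(10, 50), (100, 150), (200, 250)]   # sorted transcript exons
prot_start, prot_end = 30, 220               # protein start/end on the reference
cds = []
for ex_start, ex_end in exons:
    if ex_end < prot_start or ex_start > prot_end:
        continue                             # exon entirely outside the coding span
    cds.append((max(prot_start, ex_start), min(prot_end, ex_end)))
print(cds)  # [(30, 50), (100, 150), (200, 220)]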
Example #18
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Return True if the two breakends can be a readthrough.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    is_readthrough = False
    if up.chrom == down.chrom:
        up_strand = getStrand(up, True)
        down_strand = getStrand(down, False)
        if (up_strand == "+" and down_strand == "+") or (up_strand == "-" and down_strand == "-"):  # Readthrough are +/+ or -/-
            first = up
            second = down
            if first.pos > second.pos:
                first = down
                second = up
            first_start, first_end = getBNDInterval(first)
            second_start, second_end = getBNDInterval(second)
            interval_start = min(first_start, second_start)
            interval_end = max(first_end, second_end) + 1
            if interval_end - interval_start <= rt_max_dist:
                first_bp_gene = {annCmpName(annot) for annot in first.info[annotation_field]}
                second_bp_gene = {annCmpName(annot) for annot in second.info[annotation_field]}
                full_overlapping_gene = first_bp_gene & second_bp_gene
                only_first_bp_gene = first_bp_gene - second_bp_gene
                only_second_bp_gene = second_bp_gene - first_bp_gene
                if len(only_first_bp_gene) != 0 and len(only_second_bp_gene) != 0:
                    strand_by_gene = {annCmpName(annot): annot["STRAND"] for annot in first.info[annotation_field] + second.info[annotation_field]}
                    only_first_bp_gene = {gene for gene in only_first_bp_gene if strand_by_gene[gene] == up_strand}
                    only_second_bp_gene = {gene for gene in only_second_bp_gene if strand_by_gene[gene] == up_strand}
                    possible_on_strand = len(only_first_bp_gene) != 0 and len(only_second_bp_gene) != 0
                    if possible_on_strand:
                        interval_region = Region(interval_start, interval_end, up_strand, first.chrom)
                        overlapped_genes = genes.getChr(first.chrom).getOverlapped(interval_region)
                        overlapped_genes = RegionList([gene for gene in overlapped_genes if regCmpName(gene) not in full_overlapping_gene and gene.strand == up_strand])
                        overlapped_genes_by_id = {regCmpName(gene): gene for gene in overlapped_genes}
                        contradict_readthrough = False
                        for start_gene_id in only_first_bp_gene:
                            start_gene = overlapped_genes_by_id[start_gene_id]
                            for end_gene_id in only_second_bp_gene:
                                end_gene = overlapped_genes_by_id[end_gene_id]
                                for interval_gene in overlapped_genes:
                                    if regCmpName(interval_gene) != regCmpName(start_gene) and \
                                       regCmpName(interval_gene) != regCmpName(end_gene):
                                        if not interval_gene.hasOverlap(start_gene) and not interval_gene.hasOverlap(end_gene):
                                            contradict_readthrough = True
                        is_readthrough = not contradict_readthrough
    return is_readthrough
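A simplified, hypothetical sketch of the preconditions tested above (same chromosome, same orientation, breakends closer than rt_max_dist); the real function additionally checks the annotated and overlapped genes:

# Hypothetical, simplified helper: only the distance/strand preconditions, not the gene checks.
def could_be_readthrough(up_chrom, up_strand, up_pos, down_chrom, down_strand, down_pos, rt_max_dist):
    if up_chrom != down_chrom:
        return False
    if up_strand != down_strand:  # readthroughs are +/+ or -/-
        return False
    return abs(down_pos - up_pos) <= rt_max_dist

print(could_be_readthrough("chr1", "+", 1500, "chr1", "+", 21000, 100000))  # True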
Example #19
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return, by chromosome, the region of the first breakend in each fusion. The annotation of each region contains the two breakends (tags: first and second).

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome, the region of the first breakend in each fusion. The annotation of each region contains the two breakends (tags: first and second).
    :rtype: dict
    """
    caller_fusions = dict()
    processed_fusions = set()
    fusion_by_name = {}
    for id, record in bnd_by_id.items():
        for alt_idx, alt in enumerate(record.alt):
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:
                first_new_id += "_" + str(
                    alt_idx)  # Record must be splitted for each mate
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                second_new_id += "_" + first_idx  # Record must be splitted for each mate
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id]))
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id not in processed_fusions:
                processed_fusions.add(fusion_id)
                if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                    raise Exception(
                        "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                        .format(alt_first_bnd.id, mate_id))
                if "RNA_FIRST" in alt_second_bnd.info:
                    aux = alt_first_bnd
                    alt_first_bnd = alt_second_bnd
                    alt_second_bnd = aux
                interval_first_bnd = getBNDInterval(alt_first_bnd)
                fusion_name = " @@ ".join(
                    sorted([alt_first_bnd.getName(),
                            alt_second_bnd.getName()]))
                if fusion_name not in fusion_by_name:
                    region_first_bnd = Region(interval_first_bnd[0],
                                              interval_first_bnd[1],
                                              reference=alt_first_bnd.chrom,
                                              annot={
                                                  "first": alt_first_bnd,
                                                  "second": alt_second_bnd
                                              })
                    if alt_first_bnd.chrom not in caller_fusions:
                        caller_fusions[alt_first_bnd.chrom] = RegionList()
                    caller_fusions[alt_first_bnd.chrom].append(
                        region_first_bnd)
                    fusion_by_name[fusion_name] = region_first_bnd
                else:  # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                    fusion_by_name[fusion_name].annot["first"].info[
                        annotation_field] += alt_first_bnd.info[
                            annotation_field]
                    fusion_by_name[fusion_name].annot["second"].info[
                        annotation_field] += alt_second_bnd.info[
                            annotation_field]
    return caller_fusions
Example #20
def getMergedRecords(inputs_variants, calling_sources, annotation_field,
                     shared_filters):
    """
    Merge VCFRecords coming from several variant callers.

    :param inputs_variants: Paths to the variant files.
    :type inputs_variants: list
    :param calling_sources: Names of the variants callers (in same order as inputs_variants).
    :type calling_sources: list
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param shared_filters: Filter tags applying to the variant and independent of the caller, like filters on annotations. These filters are not renamed to add the caller ID as a suffix.
    :type shared_filters: set
    :return: Merged VCF records.
    :rtype: list
    """
    whole_fusions = {}  # first bnd region by chromosome
    for idx_in, curr_in in enumerate(inputs_variants):
        curr_caller = calling_sources[idx_in]
        log.info("Process {}".format(curr_caller))
        # breakend by id
        bnd_by_id = loadBNDByID(curr_in)
        # Group by fusion
        curr_caller_fusions = groupBNDByFusions(bnd_by_id, annotation_field)
        # Merge to other callers
        new_fusions = []
        for chrom, query, overlapped in iterOverlappedByRegion(
                curr_caller_fusions, whole_fusions):
            records = (query.annot["first"], query.annot["second"])
            # Extract PR and SR
            support_by_spl = {}
            for spl, data in records[0].samples.items():
                support_by_spl[spl] = {
                    "PR": getCount(data, "PR"),
                    "SR": getCount(data, "SR")
                }
            # Get identical fusion from previous callers
            prev_records = getPrevFusion(records, overlapped, curr_caller)
            # Rename fields
            for curr_record in records:
                renameFields(curr_record, "s{}".format(idx_in), shared_filters)
            # Add to storage
            if prev_records is None:  # Prepare new fusion
                new_fusions.append(query)
                for curr_record in records:
                    # Data source
                    curr_record.info["SRC"] = [curr_caller]
                    curr_record.info["REFSRC"] = curr_caller
                    curr_record.info["IDSRC"] = [curr_record.id]
                    # CIPOS
                    if "s{}_CIPOS".format(idx_in) in curr_record.info:
                        curr_record.info["CIPOS"] = curr_record.info[
                            "s{}_CIPOS".format(idx_in)]
                    # Quality
                    if idx_in != 0:
                        curr_record.qual = None  # For consistency, the quality of the variant comes only from the first caller of the variant
                    # SR and PR by sample (from the first caller finding the variant: callers are in user order)
                    curr_record.format.insert(0, "SRSRC")
                    curr_record.format.insert(0, "PRSRC")
                    curr_record.format.insert(0, "SR")
                    curr_record.format.insert(0, "PR")
                    for spl_name, spl_data in curr_record.samples.items():
                        spl_data["SR"] = support_by_spl[spl_name]["SR"]
                        spl_data["PR"] = support_by_spl[spl_name]["PR"]
                        spl_data["SRSRC"] = [support_by_spl[spl_name]["SR"]]
                        spl_data["PRSRC"] = [support_by_spl[spl_name]["PR"]]
            else:  # Update previous fusion
                for prev_rec, curr_rec in zip(prev_records, records):
                    prev_rec.info["SRC"].append(curr_caller)
                    prev_rec.info["IDSRC"].append(curr_rec.id)
                    # FILTERS
                    # Imprecise is taken into account only for the first caller to keep consistency with CIPOS
                    new_filters = set(curr_rec.filter) - {"Imprecise"}
                    prev_rec.filter = list(set(prev_rec.filter) | new_filters)
                    # FORMAT
                    prev_rec.format.extend(curr_rec.format)
                    # INFO
                    del (curr_rec.info["MATEID"])
                    if "IMPRECISE" in curr_rec.info:
                        del (
                            curr_rec.info["IMPRECISE"]
                        )  # Imprecise is take into accout only for the first caller to keep consistency with CIPOS
                    prev_rec.info.update(curr_rec.info)
                    # SAMPLES
                    for spl_name, spl_data in prev_rec.samples.items():
                        spl_data.update(curr_rec.samples[spl_name])
                        spl_data["SRSRC"].append(
                            support_by_spl[spl_name]["SR"])
                        spl_data["PRSRC"].append(
                            support_by_spl[spl_name]["PR"])
        # Add new fusions in whole_fusions
        for curr in new_fusions:
            if curr.reference.name not in whole_fusions:
                whole_fusions[curr.reference.name] = RegionList()
            whole_fusions[curr.reference.name].append(curr)
        # Sort fusions by first breakend
        for chrom, fusions in whole_fusions.items():
            whole_fusions[chrom] = RegionList(
                sorted(fusions, key=lambda x: (x.start, x.end)))
    # Flatten fusions
    returned_fusions = []
    for chr, fusions in whole_fusions.items():
        for fusion_region in fusions:
            returned_fusions.append(
                (fusion_region.annot["first"], fusion_region.annot["second"]))
    return returned_fusions
Example #21
 def testIterOverlapped_3(self):
     """Case where a subject is included in another."""
     # Init test data
     sbjct_1 = Region(7, 10, "+", "chr1", "sbjct_1")
     sbjct_2 = Region(14, 20, "+", "chr1", "sbjct_2")
     sbjct_3 = Region(16, 18, "+", "chr1", "sbjct_3")
     sbjct_4 = Region(24, 29, "+", "chr1", "sbjct_4")
     subjects = RegionList([sbjct_1, sbjct_2, sbjct_3, sbjct_4])
     queries_info = [
         {"query": Region(11, 11, "+", "chr1", "query_l1_01"), "overlapped": []},
         {"query": Region(12, 12, "+", "chr1", "query_l1_02"), "overlapped": []},
         {"query": Region(13, 13, "+", "chr1", "query_l1_03"), "overlapped": []},
         {"query": Region(14, 14, "+", "chr1", "query_l1_04"), "overlapped": [sbjct_2]},
         {"query": Region(15, 15, "+", "chr1", "query_l1_05"), "overlapped": [sbjct_2]},
         {"query": Region(16, 16, "+", "chr1", "query_l1_06"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(17, 17, "+", "chr1", "query_l1_07"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(18, 18, "+", "chr1", "query_l1_08"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(19, 19, "+", "chr1", "query_l1_09"), "overlapped": [sbjct_2]},
         {"query": Region(20, 20, "+", "chr1", "query_l1_10"), "overlapped": [sbjct_2]},
         {"query": Region(21, 21, "+", "chr1", "query_l1_11"), "overlapped": []},
         {"query": Region(22, 22, "+", "chr1", "query_l1_12"), "overlapped": []},
         {"query": Region(11, 13, "+", "chr1", "query_l3_01"), "overlapped": []},
         {"query": Region(12, 14, "+", "chr1", "query_l3_02"), "overlapped": [sbjct_2]},
         {"query": Region(13, 15, "+", "chr1", "query_l3_03"), "overlapped": [sbjct_2]},
         {"query": Region(14, 16, "+", "chr1", "query_l3_04"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(15, 17, "+", "chr1", "query_l3_05"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(16, 18, "+", "chr1", "query_l3_06"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(17, 19, "+", "chr1", "query_l3_07"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(18, 20, "+", "chr1", "query_l3_08"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(19, 21, "+", "chr1", "query_l3_09"), "overlapped": [sbjct_2]},
         {"query": Region(20, 22, "+", "chr1", "query_l3_10"), "overlapped": [sbjct_2]},
         {"query": Region(21, 23, "+", "chr1", "query_l3_11"), "overlapped": []},
         {"query": Region(13, 17, "+", "chr1", "query_l5_01"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(15, 19, "+", "chr1", "query_l5_02"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(17, 21, "+", "chr1", "query_l5_03"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(18, 22, "+", "chr1", "query_l5_04"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(19, 23, "+", "chr1", "query_l5_05"), "overlapped": [sbjct_2]},
         {"query": Region(20, 24, "+", "chr1", "query_l5_06"), "overlapped": [sbjct_2, sbjct_4]},
         {"query": Region(13, 18, "+", "chr1", "query_l6_01"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(14, 19, "+", "chr1", "query_l6_02"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(15, 20, "+", "chr1", "query_l6_03"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(16, 21, "+", "chr1", "query_l6_04"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(17, 22, "+", "chr1", "query_l6_05"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(18, 23, "+", "chr1", "query_l6_06"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(19, 24, "+", "chr1", "query_l6_07"), "overlapped": [sbjct_2, sbjct_4]},
         {"query": Region(13, 19, "+", "chr1", "query_l7_01"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(14, 20, "+", "chr1", "query_l7_02"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(15, 21, "+", "chr1", "query_l7_03"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(16, 22, "+", "chr1", "query_l7_04"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(17, 23, "+", "chr1", "query_l7_05"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(18, 24, "+", "chr1", "query_l7_06"), "overlapped": [sbjct_2, sbjct_3, sbjct_4]},
         {"query": Region(19, 24, "+", "chr1", "query_l7_07"), "overlapped": [sbjct_2, sbjct_4]},
         {"query": Region(13, 20, "+", "chr1", "query_l8_01"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(14, 21, "+", "chr1", "query_l8_02"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(15, 22, "+", "chr1", "query_l8_03"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(13, 21, "+", "chr1", "query_l9_01"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(14, 22, "+", "chr1", "query_l9_02"), "overlapped": [sbjct_2, sbjct_3]},
         {"query": Region(13, 22, "+", "chr1", "query_l10_01"), "overlapped": [sbjct_2, sbjct_3]}
     ]
     queries_info = sorted(queries_info, key=lambda x: (x["query"].start, x["query"].end))
     # Independent evaluation
     for curr_eval in queries_info:
         obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped([curr_eval["query"]], subjects)]
         self.assertEqual(obs_overlapped, [curr_eval["overlapped"]])
     # Grouped evaluation
     queries = [curr_info["query"] for curr_info in queries_info]
     expec_overlapped = [curr_info["overlapped"] for curr_info in queries_info]
     obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped(queries, subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)
     # Grouped evaluation where the inclusion between subjects is at the start of the subjects list
     shifted_subjects = subjects[1:]
     queries = [curr_info["query"] for curr_info in queries_info]
     expec_overlapped = [curr_info["overlapped"] for curr_info in queries_info]
     obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped(queries, shifted_subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)
     # Grouped evaluation where the inclusion between subjects is at the end of the subjects list
     poped_subjects = subjects[:-1]
     queries = [curr_info["query"] for curr_info in queries_info]
     expec_overlapped = []
     for curr_info in queries_info:
         expec_overlapped.append([elt for elt in curr_info["overlapped"] if elt != sbjct_4])
     obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped(queries, poped_subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)