Esempio n. 1
0
def getPrimersByChr(in_regions):
    """
    @summary: Returns the primers of the amplicons grouped by chromosome.
    @param in_regions: [str] Path to the amplicons design with their primers (format: BED). The zone of interest of each amplicon is delimited by thickStart and thickEnd.
    @return: [dict] By chromosome, an instance of RegionList containing the primers. Each primer has a "location" annotation: "upstream" or "downstream" (expressed relative to the strand +).
    """
    primers_by_chr = {}
    with BEDIO(in_regions) as reader:
        for amplicon in reader:
            chrom = amplicon.chrom
            if chrom not in primers_by_chr:
                primers_by_chr[chrom] = RegionList()
            missing_thick = amplicon.thickStart is None or amplicon.thickEnd is None
            if missing_thick:
                raise Exception(
                    'The BED file "' + in_regions +
                    '" does not contains thickStart and thickEnd for all the amplicons.'
                )
            # The primers are the two flanks located outside the thick (interest) zone
            flanks = (
                (amplicon.start, amplicon.thickStart - 1, "upstream"),
                (amplicon.thickEnd + 1, amplicon.end, "downstream"),
            )
            chrom_primers = primers_by_chr[chrom]
            for primer_start, primer_end, location in flanks:
                chrom_primers.append(
                    Region(primer_start, primer_end, amplicon.strand,
                           amplicon.reference, None, {"location": location})
                )
    return primers_by_chr
Esempio n. 2
0
    def testShallowFromAlignment(self):
        """
        Check the shallow areas reported by shallowFromAlignment() with a minimum depth of 1 in "reads" mode.

        The diagrams below show, for each reference, the reads aligned on it and
        the selected region (dashes line).

        art_chr1:
                10        20        30        40        50        60        70        80        90       100       110       120
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|12345678
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT
                   *******.************************************************** ******************************************.*********
                TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA
                                                                              ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG
                 ------------------------------------------------------------------------------------------------------------------

        art_chr2:
                10        20        30        40        50        60        70        80        90       100       110       120
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|12345678
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT               **********************         **********************
               ********************************************        ***************************************************
               AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAA        CATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC
                 -------------------------------------------------------------------------------------------

        art_chr3:
                10        20        30        40        50        60        70
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|12
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT

                 -------------------------------
        """
        # Alignment fixture (fields are tab-separated):
        #   read_1: art_chr1, 3 soft-clipped bases then 58 aligned bases from position 12 (12-69)
        #   read_2: art_chr1, 52 aligned bases from position 71 (71-122)
        #   read_3: art_chr2, 44 aligned bases from position 8, a deletion of 8 bases then 51 aligned bases
        #   art_chr3 has no read
        with open(self.tmp_sam, "w") as writer:
            writer.write("""@SQ	SN:art_chr1	LN:128
@SQ	SN:art_chr2	LN:128
@SQ	SN:art_chr3	LN:72
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fasta reads.fasta
read_1	0	art_chr1	12	60	3S58M	*	0	0	TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA	*	NM:i:1	MD:Z:7G50	AS:i:53	XS:i:0
read_2	0	art_chr1	71	60	52M	*	0	0	ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG	*	NM:i:1	MD:Z:42G9	AS:i:47	XS:i:0
read_3	0	art_chr2	8	60	44M8D51M	*	0	0	AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAACATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC	*	NM:i:8	MD:Z:44^TACTAAAT51	AS:i:81	XS:i:0
""")
        samToBam(self.tmp_sam, self.tmp_bam)
        pysam.index(self.tmp_bam)

        # shallowFromAlignment() only calls log.info(): a silent stub is enough
        class FakeLogger:
            def info(self, msg):
                pass

        # One targeted region by reference (1-based coordinates)
        selected_regions = RegionList([
            Region(10, 123, None, "art_chr1"),
            Region(10, 100, None, "art_chr2"),
            Region(10, 40, None, "art_chr3"),
        ])

        # art_chr1: bases before read_1, between the two reads and after read_2
        # art_chr2: nothing is expected (the deleted bases of read_3 are not reported)
        # art_chr3: the whole region is expected (no read)
        expected = [
            "art_chr1:10-11",
            "art_chr1:70-70",
            "art_chr1:123-123",
            "art_chr3:10-40",
        ]
        observed = [
            str(elt) for elt in shallowFromAlignment(
                self.tmp_bam, selected_regions, "reads", 1, FakeLogger())
        ]
        # The order of the shallow areas is not part of the check
        self.assertEqual(sorted(expected), sorted(observed))
Esempio n. 3
0
 def __init__(
     self, chrom=None, start=None, end=None, name=None, score=None, strand=None,
     thickStart=None, thickEnd=None, itemRgb=None, blockCount=None, blockSizes=None, blockStarts=None
 ):
     """
     Build the record from the values of the BED columns.

     The location (start, end, strand, chrom) and the name are handled by the
     Region initializer; the other columns are stored as attributes with the
     same name as the parameter.
     """
     Region.__init__(self, start, end, strand, chrom, name)
     bed_specific = (
         ("score", score),
         ("thickStart", thickStart),
         ("thickEnd", thickEnd),
         ("itemRgb", itemRgb),
         ("blockCount", blockCount),
         ("blockSizes", blockSizes),
         ("blockStarts", blockStarts),
     )
     for attr_name, attr_value in bed_specific:
         setattr(self, attr_name, attr_value)
Esempio n. 4
0
    def __init__(
        self, chrom=None, start=None, end=None, name=None, score=None, strand=None,
        thickStart=None, thickEnd=None, itemRgb=None,
        blockCount=None, blockSizes=None, blockStarts=None
    ):
        """
        Build and return an instance of BEDRecord.

        :param chrom: Name of the chromosome on which the annotation is defined.
        :type chrom: str
        :param start: Start position on the reference (1-based, start <= end).
        :type start: int
        :param end: End position on the reference (1-based, start <= end). [Default: start]
        :type end: int
        :param name: Name of the annotation.
        :type name: str
        :param score: Score between 0 and 1000.
        :type score: int
        :param strand: Strand of the annotation ("+" or "-").
        :type strand: str
        :param thickStart: First position at which the feature is drawn thickly (for example the start codon in gene displays). 1-based.
        :type thickStart: int
        :param thickEnd: Last position at which the feature is drawn thickly (for example the stop codon in gene displays). 1-based.
        :type thickEnd: int
        :param itemRgb: RGB value of the form R,G,B (e.g. 255,0,0).
        :type itemRgb: list
        :param blockCount: Number of blocks (exons) in the BED line.
        :type blockCount: int
        :param blockSizes: Sizes of the blocks.
        :type blockSizes: list
        :param blockStarts: Starts of the blocks, relative to chromStart.
        :type blockStarts: list
        :return: The new instance.
        :rtype: BEDRecord
        """
        # Location and name are managed by the parent class
        Region.__init__(self, start, end, strand, chrom, name)
        # Display columns
        self.thickStart = thickStart
        self.thickEnd = thickEnd
        self.itemRgb = itemRgb
        self.score = score
        # Blocks columns
        self.blockCount = blockCount
        self.blockStarts = blockStarts
        self.blockSizes = blockSizes
Esempio n. 5
0
 def testSplittedByRef(self):
     """Check that splittedByRef() groups the regions under the name of their reference."""
     regions = RegionList([
         Region(10, 30, "-", "chr1", "region1"),
         Region(40, 70, "-", "chr1", "region2"),
         Region(80, 100, "-", "chr2", "region3")
     ])
     by_ref = splittedByRef(regions)
     # Flatten the groups, ordered by reference name, as "reference:region_name"
     observed = [
         "{}:{}".format(ref_name, region.name)
         for ref_name, ref_regions in sorted(by_ref.items())
         for region in ref_regions
     ]
     self.assertEqual(
         ["chr1:region1", "chr1:region2", "chr2:region3"],
         observed
     )
Esempio n. 6
0
def getTargets(in_aln, in_targets=None):
    """
    Return the list of targeted regions.

    Without targets file, each reference sequence declared in the header of the
    alignment file is returned as one region covering its whole length. With a
    targets file, its regions are returned sorted by reference name, start and
    end after a check of the lack of overlap. An empty targets file produces an
    empty list.

    :param in_aln: Path to the alignment file (format: SAM/BAM).
    :type in_aln: str
    :param in_targets: Path to the targeted regions (format: BED). They must not contains any overlap.
    :type in_targets: str
    :return: List of targeted regions.
    :rtype: anacore.region.RegionList
    :raises Exception: When two regions of the targets file overlap.
    """
    if in_targets is None:
        selected_regions = RegionList()
        with pysam.AlignmentFile(in_aln, "rb") as FH_bam:
            for ref_info in FH_bam.header["SQ"]:
                selected_regions.append(
                    Region(1, ref_info["LN"], "+", ref_info["SN"], ref_info["SN"])
                )
        return selected_regions
    sorted_regions = sorted(
        getAreas(in_targets),
        key=lambda x: (x.reference.name, x.start, x.end)
    )
    # Check lack of overlap: after the sort, an overlap necessarily involves two
    # neighbours. The zip is empty with zero or one region, so an empty targets
    # file no longer fails on the access to the first element.
    for prev_region, curr_region in zip(sorted_regions, sorted_regions[1:]):
        if curr_region.reference.name == prev_region.reference.name:
            if prev_region.end >= curr_region.start:
                raise Exception("The regions {} and {} contains an overlap.".format(prev_region, curr_region))
    # sorted() produces a plain list: wrap it to return the documented type in both modes
    return RegionList(sorted_regions)
Esempio n. 7
0
def addToShallow(curr_chr, curr_pos, prev_opened, shallows):
    """
    Register a position with low depth in the shallow areas under construction.

    When the position directly follows the opened frame, the frame is extended.
    Otherwise the opened frame (if any) is stored in shallows as a region, and a
    new frame is opened on the current position. The arguments prev_opened and
    shallows are updated in place; nothing is returned.

    :param curr_chr: Name of the current region.
    :type curr_chr: str
    :param curr_pos: The current position with low DP (0-based).
    :type curr_pos: int
    :param prev_opened: The previous shallow frame ({"start": x, "end": y}). Start and end are None when no frame is opened.
    :type prev_opened: dict
    :param shallows: The list of shallows areas.
    :type shallows: anacore.region.RegionList
    """
    frame_is_opened = prev_opened["start"] is not None
    if frame_is_opened and prev_opened["end"] == curr_pos - 1:
        # Consecutive position: only extend the opened frame
        prev_opened["end"] = curr_pos
        return
    if frame_is_opened:
        # Gap with the opened frame: store it (frame is 0-based, region is 1-based)
        closed_area = Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", curr_chr)
        shallows.append(closed_area)
    # Open a new frame reduced to the current position
    prev_opened["start"] = curr_pos
    prev_opened["end"] = curr_pos
Esempio n. 8
0
def getFragmentRegion(chrom_seq, target, target_seq, start_pos, fragment_len):
    """
    Return the region of a fragment starting at start_pos, with its sequence.

    Outside the target, the sequence is taken from chrom_seq. On the target, it
    is taken from target_seq through getPartialFragment() (presumably because
    target_seq can differ from the reference sequence of the target: to confirm
    against the callers).

    :param chrom_seq: Sequence of the chromosome containing the target.
    :type chrom_seq: str
    :param target: The targeted region.
    :type target: Region
    :param target_seq: Sequence used for the part of the fragment located on the target.
    :type target_seq: str
    :param start_pos: Start position of the fragment on the chromosome (1-based).
    :type start_pos: int
    :param fragment_len: Length of the fragment.
    :type fragment_len: int
    :return: The region of the fragment. The sequence is stored in the annotation "seq".
    :rtype: Region
    """
    naive_end_pos = start_pos + fragment_len - 1
    if start_pos > target.end or naive_end_pos < target.start:
        # Fragment entirely after or entirely before the target
        end_pos = naive_end_pos
        fragment_seq = chrom_seq[start_pos - 1:end_pos]  # 1-based positions to 0-based slice
    else:
        # Fragment overlapping the target
        fragment_seq = ""
        if start_pos < target.start:
            # Part located before the target
            idx_on_target = 0
            fragment_seq = chrom_seq[start_pos - 1:target.start - 1]
        else:
            idx_on_target = start_pos - target.start
        # Part located on the target
        on_target_seq, end_idx, missing_len = getPartialFragment(
            target_seq, idx_on_target, fragment_len - len(fragment_seq)
        )
        fragment_seq += on_target_seq
        end_pos = target.start + end_idx + missing_len
        if missing_len > 0:
            # Part located after the target
            after_start_pos = target.end + 1
            after_end_pos = after_start_pos + missing_len - 1
            fragment_seq += chrom_seq[after_start_pos - 1:after_end_pos]
    return Region(
        start_pos, end_pos, None, target.reference, None, {"seq": fragment_seq}
    )
Esempio n. 9
0
    def __init__(
        self, seq_id=None, source=None, type=None, start=None, end=None,
        score=None, strand=None, phase=None, attributes=None
    ):
        """
        Build and return an instance of GFF3Record.

        The attribute "Name", when it exists, becomes the name of the region and
        is removed from a copy of the attributes: the dictionary provided by the
        caller is never modified.

        :param seq_id: ID of the landmark used to establish the coordinate system for the current feature.
        :type seq_id: str.
        :param source: Free text qualifier describing the algorithm or operating procedure that generated this feature (typically a software or a database name).
        :type source: str.
        :param type: Type of the feature: a term from the "lite" sequence ontology (SOFA) or a SOFA accession number (syntax SO:000000).
        :type type: str.
        :param start: Start of the feature, in 1-based integer coordinates relative to the landmark given in seq_id (start <= end).
        :type start: int.
        :param end: End of the feature, in 1-based integer coordinates relative to the landmark given in seq_id (start <= end).
        :type end: int.
        :param score: Score of the feature (E-value for sequence similarity features, P-value for ab initio gene prediction features).
        :type score: float.
        :param strand: Strand of the instance ("+" or "-").
        :type strand: str.
        :param phase: For features of type "CDS", number of bases (0, 1 or 2) to remove from the beginning of the feature to reach the first base of the next codon. It is counted from the start for forward strand features and from the end for reverse strand features. Not to be confused with the frame.
        :type phase: int.
        :param attributes: Annotations of the feature.
        :type attributes: dict.
        :return: The new instance.
        :rtype: GFF3Record
        """
        if attributes is None or "Name" not in attributes:
            name = None
            cleaned_attributes = attributes
        else:
            name = attributes["Name"]
            # Work on a copy to keep the caller's dictionary untouched
            cleaned_attributes = copy.deepcopy(attributes)
            del cleaned_attributes["Name"]
        Region.__init__(
            self, start, end, strand, seq_id, name, cleaned_attributes
        )
        self.type = type
        self.source = source
        self.phase = phase
        self.score = score
Esempio n. 10
0
def shallowFromAlignment(aln_path, selected_regions, depth_mode, min_depth, log):
    """
    Return the list of shallow regions from the alignment file.

    Each selected region is scanned position by position. The positions without
    pileup column and the positions with a depth under min_depth are gathered in
    areas of consecutive positions. The secondary alignments, the duplicates and
    the reference skips are not counted in the depth.

    :param aln_path: Path to the alignment file (format: SAM/BAM).
    :type aln_path: str
    :param selected_regions: Targeted regions. They must not contains any overlap between them.
    :type selected_regions: anacore.region.RegionList
    :param depth_mode: How count the depth: by reads (each reads is added independently) or by fragment (the R1 and R2 coming from the same pair are counted only once).
    :type depth_mode: str
    :param min_depth: All the locations with a depth under this value are reported in shallows areas.
    :type min_depth: int
    :param log: Logger of the script.
    :type log: logging.Logger
    :return: List of shallow regions.
    :rtype: anacore.region.RegionList
    """
    shallow = RegionList()
    nb_selected_regions = len(selected_regions)
    idx_in_part = 1
    with pysam.AlignmentFile(aln_path, "rb") as FH_bam:
        for idx_region, region in enumerate(selected_regions):
            # Progress is logged roughly every tenth of the regions
            if idx_in_part > nb_selected_regions / 10:
                idx_in_part = 0
                log.info("Processed regions {}/{}.".format(idx_region + 1, nb_selected_regions))
            idx_in_part += 1
            prev_opened = {"start": None, "end": None}
            curr_checked = region.start - 1  # Next position to evaluate (0-based)
            # pysam intervals are 0-based and half-open: the stop must be region.end
            # (and not region.end - 1) to include the last base of the region in the
            # query. Otherwise a read covering only this base is never retrieved.
            pileup_iter = FH_bam.pileup(region.reference.name, region.start - 1, region.end, max_depth=100000000)
            for pileupcolumn in pileup_iter:
                ref_pos = pileupcolumn.reference_pos  # 0-based
                # The pileup also returns the columns located outside the interval
                # when they are covered by a read overlapping it
                if not (region.start <= ref_pos + 1 <= region.end):
                    continue
                # Missing positions
                while curr_checked < ref_pos:
                    addToShallow(region.reference, curr_checked, prev_opened, shallow)
                    curr_checked += 1
                # Current position
                curr_reads_depth = 0
                curr_frag = set()
                for pileupread in pileupcolumn.pileups:
                    if not pileupread.alignment.is_secondary and not pileupread.alignment.is_duplicate and not pileupread.is_refskip:
                        curr_reads_depth += 1
                        curr_frag.add(pileupread.alignment.query_name)
                curr_depth = curr_reads_depth
                if depth_mode == "fragment":
                    curr_depth = len(curr_frag)
                if min_depth > curr_depth:
                    addToShallow(region.reference, ref_pos, prev_opened, shallow)
                curr_checked = ref_pos + 1
            # Missing positions at the end of the region
            while curr_checked < region.end:
                addToShallow(region.reference, curr_checked, prev_opened, shallow)
                curr_checked += 1
            # Store the last opened frame (frame is 0-based, region is 1-based)
            if prev_opened["start"] is not None:
                shallow.append(
                    Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", region.reference)
                )
    return shallow
Esempio n. 11
0
 def testGetPosOnRef(self):
     """Check the conversion of a position on the region to a position on the reference."""
     # Forward strand: positions on the region are counted from the start
     forward_region = Region(9, 15, "+")
     forward_cases = (
         (1, 9),  # First nt
         (3, 11),
         (7, 15),  # Last nt
     )
     for pos_on_region, expected_pos in forward_cases:
         self.assertEqual(forward_region.getPosOnRef(pos_on_region), expected_pos)
     # NOTE: the position 8 (out of region) is not evaluated, its check was disabled
     # Reverse strand: positions on the region are counted from the end
     reverse_region = Region(9, 15, "-")
     reverse_cases = (
         (1, 15),  # First nt
         (3, 13),
         (7, 9),  # Last nt
     )
     for pos_on_region, expected_pos in reverse_cases:
         self.assertEqual(reverse_region.getPosOnRef(pos_on_region), expected_pos)
Esempio n. 12
0
def mergedOverlapped(regions, padding=0, trace=False):
    """
    Return the regions after the merge of the overlapping ones.

    The regions are sorted by start then end, and each region overlapping the
    previous kept region is merged into it. The padding is applied on both
    compared borders: two regions are merged when the padded start of the
    second is lower than or equal to the padded end of the first. The reference
    of the regions is not evaluated: all the regions must come from the same
    reference.

    The kept regions are the objects provided in regions: their end (and their
    annotation "merge_traceback" with trace) is updated in place.

    :param regions: The regions to merge. They must have the attributes start and end.
    :type regions: list
    :param padding: Number of positions added on each side of the regions for the overlap evaluation. [Default: 0]
    :type padding: int
    :param trace: With True, each merged region stores copies of the regions it comes from in its annotation "merge_traceback". [Default: False]
    :type trace: bool
    :return: The merged regions, sorted by start.
    :rtype: list
    """
    merged = []
    prev_region = None  # No sentinel region: with padding it could absorb the first region
    for curr_region in sorted(regions, key=lambda x: (x.start, x.end)):
        overlap_prev = False
        if prev_region is not None:
            curr_start = max(1, curr_region.start - padding)
            prev_end = prev_region.end + padding  # End of the previous kept region, not of the current one
            overlap_prev = curr_start <= prev_end
        if not overlap_prev:
            merged.append(curr_region)
            prev_region = curr_region
            continue
        if trace:
            if "merge_traceback" not in prev_region.annot:
                # First merge: keep a copy of the region before its extension
                prev_region.annot["merge_traceback"] = [
                    Region(prev_region.start, prev_region.end,
                           prev_region.strand, prev_region.reference,
                           prev_region.name)
                ]
            prev_region.annot["merge_traceback"].append(
                Region(curr_region.start, curr_region.end,
                       curr_region.strand, curr_region.reference,
                       curr_region.name))
        # Max to manage included regions
        prev_region.end = max(curr_region.end, prev_region.end)
    return merged
Esempio n. 13
0
    def _parseLine(self):
        """
        Return a structured record from the TopHatFusionIO current line.

        The line is composed of 7 sections separated by "@". Only the first one
        is used: it contains 11 tab-separated fields describing the fusion.

        :return: The record: the two partners (one Region reduced to the breakpoint for each) and the supporting counts.
        :rtype: dict
        """
        sections = [elt.strip() for elt in self.current_line.split('@')]
        fusion, _unused, _contig_a, _contig_b, _depth_a, _depth_b, _mate_distances = sections
        fusion_fields = [field.strip() for field in fusion.split("\t")]
        (
            chrom, break_a, break_b, orientation,
            nb_splitted_reads, nb_splitted_pairs, nb_pairs_splitted_reads,
            nb_contradict, base_cover_left, base_cover_right, _unused
        ) = fusion_fields
        chrom_a, chrom_b = chrom.split("-")
        pos_a = int(break_a)
        pos_b = int(break_b)
        # One letter by partner in orientation: "f" for the strand +, anything else for the strand -
        strand_a, strand_b = ["+" if elt == "f" else "-" for elt in orientation]
        record = {
            "partner_a": Region(pos_a, pos_a, strand_a, chrom_a),
            "partner_b": Region(pos_b, pos_b, strand_b, chrom_b),
            "nb_splitted_reads": int(nb_splitted_reads),
            "nb_splitted_pairs": int(nb_splitted_pairs),
            "nb_pairs_splitted_reads": int(nb_pairs_splitted_reads),
            "nb_contradict": int(nb_contradict),
            "base_cover_left": int(base_cover_left),
            "base_cover_right": int(base_cover_right)
        }
        return record
Esempio n. 14
0
def getVariantRegion(variant):
    """
    @summary: Returns the region covered by the reference allele of the variant after its normalization.
    @param variant: [VCFRecord] The variant. It is not modified: the normalization is applied on a copy.
    @return: [Region] The region object corresponding to the variant.
    @warnings: This function can only be used on variant with only one alternative allele.
    """
    normalized = deepcopy(variant)
    normalized.normalizeSingleAllele()
    region_start = normalized.pos
    # Works also with normalized insertion
    region_end = normalized.pos + len(normalized.ref) - 1
    return Region(region_start, region_end, None, normalized.chrom)
Esempio n. 15
0
 def testConsolidate(self):
     """Check consolidated() without then with the merge of the contiguous regions."""
     reg_list = RegionList([
         Region(5, 9, "-", "chr1", "region1"),
         Region(10, 30, "-", "chr1", "region2"),
         Region(30, 40, "-", "chr1", "region3"),
         Region(35, 39, "-", "chr1", "region4"),
         Region(40, 70, "-", "chr1", "region5"),
         Region(71, 90, "-", "chr1", "region6"),
         Region(92, 100, "-", "chr1", "region7"),
         Region(100, 100, "+", "chr1", "region8"),
         Region(80, 100, "-", "chr2", "region9")
     ])
     cases = (
         # Merge overlapping
         (False, ["chr1:5-9[-]", "chr1:10-70[-]", "chr1:71-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]),
         # Merge overlapping and contiguous
         (True, ["chr1:5-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]),
     )
     for merge_contiguous, expected in cases:
         consolidated_reg = consolidated(reg_list, merge_contiguous)
         observed = [curr.getCoordinatesStr() for curr in consolidated_reg]
         self.assertEqual(expected, observed)
Esempio n. 16
0
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the region object corresponding to the known variants in a VCF.

    Each region covers the reference allele of the variant: from pos to
    pos + len(ref) - 1 (1-based, end included). The records with a gene symbol
    containing "_ENST" and the records known in less than min_count samples
    are skipped. A record without count is always kept.

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of samples where the variant is known in the databases to use its information.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database's samples with this variant.
    :type count: str
    :return: List of variants regions. Each region is annotated with "id", "gene", "HGVSp", "HGVSc" and "count".
    :rtype: anacore.region.RegionList
    """
    variants_region = []
    with VCFIO(vcf_path) as FH_in:
        for record in FH_in:
            info = record.info
            # presumably the symbols with "_ENST" are transcript-specific duplicates of a gene entry - TODO confirm
            if symbol in info and "_ENST" in info[symbol]:
                continue
            nb_samples = int(info[count]) if count in info else None
            if nb_samples is not None and nb_samples < min_count:
                continue
            variants_region.append(
                Region(
                    record.pos,
                    # End is included: the last base of the reference allele is pos + len(ref) - 1
                    record.pos + len(record.ref) - 1,
                    None,
                    record.chrom,
                    record.id,
                    {
                        "id": record.id,
                        "gene": (info[symbol] if symbol in info else ""),
                        "HGVSp": (info[hgvsp] if hgvsp in info else ""),
                        "HGVSc": (info[hgvsc] if hgvsc in info else ""),
                        "count": nb_samples
                    }
                )
            )
    return RegionList(variants_region)
Esempio n. 17
0
def exonsPos(record, genes_by_chr):
    """
    Count, for each exon boundary overlapped by the breakend, the transcripts having an exon with this boundary.

    Only the exons on the same strand as the breakend are taken into account. The start and the end of an exon are evaluated independently against the breakend interval.

    :param record: Breakend record with CIPOS.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: By positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.
    :rtype: dict
    """
    bnd_strand = getStrand(record)
    nb_by_boundary = {}
    bnd_start, bnd_end = getBNDInterval(record)
    bnd_region = Region(bnd_start, bnd_end, None, record.chrom, record.getName())
    if record.chrom not in genes_by_chr:
        return nb_by_boundary  # No annotation on this chromosome
    for gene in genes_by_chr[record.chrom].getOverlapped(bnd_region):
        for transcript in gene.children.getOverlapped(bnd_region):
            for feature in transcript.children.getOverlapped(bnd_region):
                if bnd_strand != feature.strand or not issubclass(feature.__class__, Exon):
                    continue
                # Exon start then exon end: each boundary located in the breakend interval is counted
                for boundary in (feature.start, feature.end):
                    if bnd_region.start <= boundary <= bnd_region.end:
                        nb_by_boundary[boundary] = nb_by_boundary.get(boundary, 0) + 1
    return nb_by_boundary
Esempio n. 18
0
    def testGetTranscriptsAnnot_withoutUTR_oneExon(self):
        """Check getTranscriptsAnnot() on a transcript with a single exon entirely covered by its CDS, on forward then reverse strand."""
        # Model: one gene, one transcript with one exon (91-150) and one protein with one CDS (91-150)
        exon_1 = Exon(91, 150, "+", "chr1", "exon_2")
        cds_1 = CDS(91, 150, "+", "chr1", "cds_1")
        gene_1 = Gene(10, 350, None, "chr1", "gene_1", {"id": "g_1"})
        transcrit_1 = Transcript(
            None, None, None, "chr1", "transcrit_1", {"id": "tr_1"},
            parent=gene_1, children=[exon_1]
        )
        protein_1 = Protein(
            None, None, None, "chr1", "protein_2",
            children=[cds_1], transcript=transcrit_1
        )
        queries = [
            Region(q_start, q_end, None, "chr1", q_name, {"desc": q_desc})
            for q_start, q_end, q_name, q_desc in [
                (80, 160, "query_1", "starts before exon_1 ; ends after exon_1."),
                (91, 150, "query_2", "starts at start of exon_1 ; ends at end of exon_1."),
                (100, 110, "query_3", "starts in exon_1 ; ends in exon_1."),
                (80, 100, "query_4", "starts before exon_1 ; ends in exon_1."),
                (110, 200, "query_5", "starts in exon_1 ; ends after exon_1."),
            ]
        ]

        # By scenario: strand applied on the model, expected STRAND tag and expected (start, end) protein positions by query
        scenarios = [
            (
                "+", "1",
                {"query_1": (1, 20), "query_2": (1, 20), "query_3": (4, 7), "query_4": (1, 4), "query_5": (7, 20)}
            ),
            (
                "-", "-1",
                {"query_1": (1, 20), "query_2": (1, 20), "query_3": (14, 17), "query_4": (17, 20), "query_5": (1, 14)}
            ),
        ]
        for strand, strand_tag, prot_pos_by_query in scenarios:
            # Expected annotations: the query is always located in the unique exon
            expected = {
                query_name: {
                    "start_EXON": "1/1",
                    "start_INTRON": None,
                    "start_Protein_position": prot_start,
                    "end_EXON": "1/1",
                    "end_INTRON": None,
                    "end_Protein_position": prot_end,
                    "SYMBOL": "gene_1",
                    "Gene": "g_1",
                    "Feature": "tr_1",
                    "Feature_type": "Transcript",
                    "STRAND": strand_tag
                }
                for query_name, (prot_start, prot_end) in prot_pos_by_query.items()
            }
            # Apply strand on the model
            for exon in transcrit_1.children:
                exon.strand = strand
            for cds in protein_1.children:
                cds.strand = strand
            transcrit_1.sortChildren()
            protein_1.sortChildren()
            # Assert
            for curr_query in queries:
                annotations = getTranscriptsAnnot(curr_query, [transcrit_1])
                self.assertEqual([expected[curr_query.name]], annotations)
Esempio n. 19
0
 def testStrandedContains(self):
     """Check strandedContains() with an included region on the same strand and on the opposite strand."""
     for container_strand in ("+", "-"):  # Forward strand then reverse strand
         container = Region(9, 15, container_strand, "chr1")
         for query_strand in ("+", "-"):
             included = Region(9, 12, query_strand, "chr1")
             self.assertEqual(
                 container.strandedContains(included),
                 query_strand == container_strand
             )
Esempio n. 20
0
    )
    args = parser.parse_args()

    # Get transcripts
    gene_by_tr = getGeneByRefTr(args.input_reference_tr)
    selected_transcripts = getTranscriptAnnot(args.input_annotation,
                                              gene_by_tr)
    tr_by_chr = splittedByRef(selected_transcripts)
    # Write renamed regions
    # The output keeps the number of columns of the input, except that a
    # 3-column input is widened to 4 columns (presumably to store the new
    # name in the 4th BED field - TODO confirm against the writer).
    out_nb_col = BEDIO.getMaxNbCol(args.input_regions)
    if out_nb_col == 3:
        out_nb_col = 4
    with BEDIO(args.input_regions) as FH_regions:
        with BEDIO(args.output_regions, "w", out_nb_col) as FH_out:
            for record_idx, record in enumerate(FH_regions):
                # Area compared to the transcripts: the whole record by default
                target = Region(record.start, record.end, record.strand,
                                record.chrom)
                # ... restricted to thickStart-thickEnd when requested and available
                if args.is_thick_based and record.thickStart is not None and record.thickEnd is not None:
                    target.start = record.thickStart
                    target.end = record.thickEnd
                overlapped_tr = list()
                if record.chrom in tr_by_chr:
                    overlapped_tr = tr_by_chr[record.chrom].getOverlapped(
                        target)
                if len(overlapped_tr) > 1:
                    warnings.warn(
                        "The region {} overlaps several transcripts ({}).".
                        format(target, [str(tr) for tr in overlapped_tr]))
                if len(overlapped_tr) >= 1:
                    # Only the first overlapped transcript is used to find the exons
                    overlapped_exons = overlapped_tr[0].children.getOverlapped(
                        target)
                    features = list()
Esempio n. 21
0
 def testGetMinDist(self):
     """Check getMinDist() with overlapping, downstream and upstream regions, and that another reference raises."""
     reference = Region(9, 15, "+", "chr1")
     # (start, end) of the compared region and expected distance
     expected_by_coord = [
         ((14, 18), 0),
         ((16, 18), 1),
         ((1, 5), 4),
     ]
     for (start, end), expected_dist in expected_by_coord:
         other = Region(start, end, "+", "chr1")
         self.assertEqual(reference.getMinDist(other), expected_dist)
     # Regions on different references
     with self.assertRaises(Exception):
         reference.getMinDist(Region(1, 5, "+", "chr2"))
Esempio n. 22
0
 def testHasOverlap(self):
     """Check hasOverlap() against regions placed around and inside the region chr1:9-15."""
     subject = Region(9, 15, "+", "chr1")
     # (start, end, reference, expected overlap)
     cases = [
         (9, 9, "chr1", True),
         (15, 15, "chr1", True),
         (12, 13, "chr1", True),
         (9, 15, "chr1", True),
         (8, 14, "chr1", True),
         (10, 16, "chr1", True),
         (8, 16, "chr1", True),
         (8, 8, "chr1", False),
         (16, 16, "chr1", False),
         (12, 13, "chr2", False),
     ]
     for start, end, reference, expected in cases:
         query = Region(start, end, "+", reference)
         self.assertEqual(subject.hasOverlap(query), expected)
Esempio n. 23
0
 def testGetPosOnRegion(self):
     """Check getPosOnRegion() on forward and reverse strands, including positions out of the region."""
     # By strand: (position on reference, expected position on region) from first to last nt
     scenarios = [
         ("+", [(9, 1), (11, 3), (15, 7)]),
         ("-", [(15, 1), (13, 3), (9, 7)]),
     ]
     for strand, expected_by_pos in scenarios:
         stranded_region = Region(9, 15, strand, "chr1")
         for ref_pos, expected in expected_by_pos:
             self.assertEqual(stranded_region.getPosOnRegion(ref_pos), expected)
         for outside_pos in (8, 16):  # Out of region
             with self.assertRaises(ValueError):
                 stranded_region.getPosOnRegion(outside_pos)
Esempio n. 24
0
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Tell whether the two breakends of a fusion are compatible with a readthrough.

    The answer is True only if: the breakends are on the same chromosome and the same strand (+/+ or -/-), the interval spanning the two breakends does not exceed rt_max_dist, each breakend is annotated by at least one gene of this strand that does not annotate the other breakend, and no other gene of this strand overlapping the interval is disjoint from both the start gene and the end gene.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    if up.chrom != down.chrom:
        return False
    up_strand = getStrand(up, True)
    down_strand = getStrand(down, False)
    if up_strand not in ("+", "-") or down_strand != up_strand:  # Readthrough are +/+ or -/-
        return False
    # Order the breakends by position on the chromosome
    if up.pos > down.pos:
        first, second = down, up
    else:
        first, second = up, down
    first_start, first_end = getBNDInterval(first)
    second_start, second_end = getBNDInterval(second)
    interval_start = min(first_start, second_start)
    interval_end = max(first_end, second_end) + 1
    if interval_end - interval_start > rt_max_dist:
        return False
    # Genes specific to each breakend
    first_bp_gene = {annCmpName(annot) for annot in first.info[annotation_field]}
    second_bp_gene = {annCmpName(annot) for annot in second.info[annotation_field]}
    full_overlapping_gene = first_bp_gene & second_bp_gene
    only_first_bp_gene = first_bp_gene - second_bp_gene
    only_second_bp_gene = second_bp_gene - first_bp_gene
    if len(only_first_bp_gene) == 0 or len(only_second_bp_gene) == 0:
        return False
    # Keep only the specific genes located on the strand of the fusion
    strand_by_gene = {
        annCmpName(annot): annot["STRAND"]
        for annot in first.info[annotation_field] + second.info[annotation_field]
    }
    only_first_bp_gene = {gene for gene in only_first_bp_gene if strand_by_gene[gene] == up_strand}
    only_second_bp_gene = {gene for gene in only_second_bp_gene if strand_by_gene[gene] == up_strand}
    if len(only_first_bp_gene) == 0 or len(only_second_bp_gene) == 0:
        return False
    # Genes of the strand overlapping the interval and not shared by the two breakends
    interval_region = Region(interval_start, interval_end, up_strand, first.chrom)
    candidates = genes.getChr(first.chrom).getOverlapped(interval_region)
    candidates = RegionList([
        gene for gene in candidates
        if regCmpName(gene) not in full_overlapping_gene and gene.strand == up_strand
    ])
    gene_by_id = {regCmpName(gene): gene for gene in candidates}
    # A gene distinct from the start and end genes and overlapping neither of them contradicts the readthrough
    contradict_readthrough = False
    for start_gene_id in only_first_bp_gene:
        start_gene = gene_by_id[start_gene_id]
        for end_gene_id in only_second_bp_gene:
            end_gene = gene_by_id[end_gene_id]
            for interval_gene in candidates:
                if regCmpName(interval_gene) == regCmpName(start_gene):
                    continue
                if regCmpName(interval_gene) == regCmpName(end_gene):
                    continue
                if interval_gene.hasOverlap(start_gene) or interval_gene.hasOverlap(end_gene):
                    continue
                contradict_readthrough = True
    return not contradict_readthrough
Esempio n. 25
0
 def testLength(self):
     """Check length() for unstranded, stranded, end-less and one-nucleotide regions."""
     # (start, end, strand) and expected length
     cases = [
         ((9, 15, None), 7),
         ((9, 15, "+"), 7),
         ((9, 15, "-"), 7),
         ((9, None, "-"), 1),
         ((9, 9, "-"), 1),
     ]
     for region_args, expected in cases:
         self.assertEqual(Region(*region_args).length(), expected)
Esempio n. 26
0
 def testContains(self):
     """Check contains() against regions placed around and inside the region chr1:9-15."""
     container_region = Region(9, 15, "+", "chr1")
     # (start, end, reference, expected inclusion)
     cases = [
         (9, 9, "chr1", True),
         (15, 15, "chr1", True),
         (12, 13, "chr1", True),
         (9, 15, "chr1", True),
         (8, 14, "chr1", False),
         (10, 16, "chr1", False),
         (8, 16, "chr1", False),
         (8, 8, "chr1", False),
         (16, 16, "chr1", False),
         (12, 13, "chr2", False),
     ]
     for start, end, reference, expected in cases:
         query = Region(start, end, "+", reference)
         self.assertEqual(container_region.contains(query), expected)
Esempio n. 27
0
 def testIterOverlapped_3(self):
     """Case where a subject is included in another."""
     # Init test data
     sbjct_1 = Region(7, 10, "+", "chr1", "sbjct_1")
     sbjct_2 = Region(14, 20, "+", "chr1", "sbjct_2")
     sbjct_3 = Region(16, 18, "+", "chr1", "sbjct_3")
     sbjct_4 = Region(24, 29, "+", "chr1", "sbjct_4")
     subjects = RegionList([sbjct_1, sbjct_2, sbjct_3, sbjct_4])
     # Expected groups of overlapped subjects
     none = ()
     s2 = (sbjct_2,)
     s2_s3 = (sbjct_2, sbjct_3)
     s2_s4 = (sbjct_2, sbjct_4)
     s2_s3_s4 = (sbjct_2, sbjct_3, sbjct_4)
     # (query start, query end, query name, expected overlapped subjects)
     query_specs = [
         (11, 11, "query_l1_01", none),
         (12, 12, "query_l1_02", none),
         (13, 13, "query_l1_03", none),
         (14, 14, "query_l1_04", s2),
         (15, 15, "query_l1_05", s2),
         (16, 16, "query_l1_06", s2_s3),
         (17, 17, "query_l1_07", s2_s3),
         (18, 18, "query_l1_08", s2_s3),
         (19, 19, "query_l1_09", s2),
         (20, 20, "query_l1_10", s2),
         (21, 21, "query_l1_11", none),
         (22, 22, "query_l1_12", none),
         (11, 13, "query_l3_01", none),
         (12, 14, "query_l3_02", s2),
         (13, 15, "query_l3_03", s2),
         (14, 16, "query_l3_04", s2_s3),
         (15, 17, "query_l3_05", s2_s3),
         (16, 18, "query_l3_06", s2_s3),
         (17, 19, "query_l3_07", s2_s3),
         (18, 20, "query_l3_08", s2_s3),
         (19, 21, "query_l3_09", s2),
         (20, 22, "query_l3_10", s2),
         (21, 23, "query_l3_11", none),
         (13, 17, "query_l5_01", s2_s3),
         (15, 19, "query_l5_02", s2_s3),
         (17, 21, "query_l5_03", s2_s3),
         (18, 22, "query_l5_04", s2_s3),
         (19, 23, "query_l5_05", s2),
         (20, 24, "query_l5_06", s2_s4),
         (13, 18, "query_l6_01", s2_s3),
         (14, 19, "query_l6_02", s2_s3),
         (15, 20, "query_l6_03", s2_s3),
         (16, 21, "query_l6_04", s2_s3),
         (17, 22, "query_l6_05", s2_s3),
         (18, 23, "query_l6_06", s2_s3),
         (19, 24, "query_l6_07", s2_s4),
         (13, 19, "query_l7_01", s2_s3),
         (14, 20, "query_l7_02", s2_s3),
         (15, 21, "query_l7_03", s2_s3),
         (16, 22, "query_l7_04", s2_s3),
         (17, 23, "query_l7_05", s2_s3),
         (18, 24, "query_l7_06", s2_s3_s4),
         (19, 24, "query_l7_07", s2_s4),
         (13, 20, "query_l8_01", s2_s3),
         (14, 21, "query_l8_02", s2_s3),
         (15, 22, "query_l8_03", s2_s3),
         (13, 21, "query_l9_01", s2_s3),
         (14, 22, "query_l9_02", s2_s3),
         (13, 22, "query_l10_01", s2_s3),
     ]
     cases = [
         (Region(start, end, "+", "chr1", name), list(hits))
         for start, end, name, hits in query_specs
     ]
     cases.sort(key=lambda case: (case[0].start, case[0].end))  # Queries ordered by start then end
     queries = [query for query, _ in cases]
     expec_overlapped = [hits for _, hits in cases]
     # Independant evaluation
     for query, expected_hits in cases:
         obs_overlapped = [hits for _, hits in iterOverlapped([query], subjects)]
         self.assertEqual(obs_overlapped, [expected_hits])
     # Grouped evaluation
     obs_overlapped = [hits for _, hits in iterOverlapped(queries, subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)
     # Grouped evaluation and inclusion between subjects starts the list of subjects
     shifted_subjects = subjects[1:]
     obs_overlapped = [hits for _, hits in iterOverlapped(queries, shifted_subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)
     # Grouped evaluation and inclusion between subjects ends the list of subjects
     poped_subjects = subjects[:-1]
     expec_without_last = [
         [elt for elt in hits if elt != sbjct_4]
         for hits in expec_overlapped
     ]
     obs_overlapped = [hits for _, hits in iterOverlapped(queries, poped_subjects)]
     self.assertEqual(obs_overlapped, expec_without_last)
Esempio n. 28
0
 def testHasStrandedOverlap(self):
     """Check hasStrandedOverlap() with an overlapping region on the same strand and on the opposite strand."""
     for subject_strand in ("+", "-"):  # Forward strand then reverse strand
         subject = Region(9, 15, subject_strand, "chr1")
         for query_strand in ("+", "-"):
             overlapping = Region(5, 12, query_strand, "chr1")
             self.assertEqual(
                 subject.hasStrandedOverlap(overlapping),
                 query_strand == subject_strand
             )
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return by chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).

    A record with several ALT is split by mate: each allele becomes a record with the ID "<id>_<allele index>" and the MATEID of the two breakends are updated accordingly. The first breakend of a fusion is the one with the tag RNA_FIRST in its info. When several pairs of breakends have the same name, their annotations are merged in the first pair seen.

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).
    :rtype: dict
    :raises Exception: If the tag RNA_FIRST is missing from the two breakends of a fusion.
    """
    caller_fusions = dict()
    processed_fusions = set()
    fusion_by_name = {}
    # NOTE(review): single-ALT records are modified in place (id and MATEID);
    # confirm that a mate renamed here is still resolved in bnd_by_id when it
    # is visited later as "record".
    for record in bnd_by_id.values():
        for alt_idx, _ in enumerate(record.alt):
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:  # Record must be split for each mate
                first_new_id += "_" + str(alt_idx)
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:  # Mate must be split for each mate
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                # list.index() returns an int: it must be converted before the concatenation
                second_new_id += "_" + str(first_idx)
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            # The fusion ID is built with the IDs before renaming
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id]))
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id in processed_fusions:
                continue  # The pair has already been seen from its mate
            processed_fusions.add(fusion_id)
            if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                raise Exception(
                    "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                    .format(alt_first_bnd.id, mate_id))
            if "RNA_FIRST" in alt_second_bnd.info:
                # The breakend tagged RNA_FIRST is stored as the first
                alt_first_bnd, alt_second_bnd = alt_second_bnd, alt_first_bnd
            interval_first_bnd = getBNDInterval(alt_first_bnd)
            fusion_name = " @@ ".join(
                sorted([alt_first_bnd.getName(),
                        alt_second_bnd.getName()]))
            if fusion_name not in fusion_by_name:
                region_first_bnd = Region(interval_first_bnd[0],
                                          interval_first_bnd[1],
                                          reference=alt_first_bnd.chrom,
                                          annot={
                                              "first": alt_first_bnd,
                                              "second": alt_second_bnd
                                          })
                if alt_first_bnd.chrom not in caller_fusions:
                    caller_fusions[alt_first_bnd.chrom] = RegionList()
                caller_fusions[alt_first_bnd.chrom].append(
                    region_first_bnd)
                fusion_by_name[fusion_name] = region_first_bnd
            else:  # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                known_fusion = fusion_by_name[fusion_name]
                known_fusion.annot["first"].info[
                    annotation_field] += alt_first_bnd.info[
                        annotation_field]
                known_fusion.annot["second"].info[
                    annotation_field] += alt_second_bnd.info[
                        annotation_field]
    return caller_fusions
Esempio n. 30
0
 def testFromStr(self):
     """Check that Region.fromStr() on a "reference:start-end" string gives the same string representation as the region built with the constructor."""
     observed = Region.fromStr("12:1534187-1534287")
     expected = Region(1534187, 1534287, None, "12")
     self.assertEqual(str(observed), str(expected))