def getPrimersByChr(in_regions):
    """
    @summary: Returns the primers of the amplicons grouped by chromosome.
    @param in_regions: [str] Path to the amplicons design with their primers (format: BED). In each record the zone of interest is delimited by thickStart and thickEnd: the primers are the two flanking parts of the record.
    @return: [dict] By chromosome an instance of RegionList containing the primers. Each primer has a "location" annotation: "upstream" or "downstream" (this information is strand + based).
    @raise Exception: When one amplicon does not have both thickStart and thickEnd.
    """
    by_chrom = dict()
    with BEDIO(in_regions) as reader:
        for amplicon in reader:
            if amplicon.chrom not in by_chrom:
                by_chrom[amplicon.chrom] = RegionList()
            if amplicon.thickStart is None or amplicon.thickEnd is None:
                raise Exception(
                    'The BED file "' + in_regions + '" does not contains thickStart and thickEnd for all the amplicons.'
                )
            # The primers are what remains of the amplicon on each side of the thick part
            flanks = (
                (amplicon.start, amplicon.thickStart - 1, "upstream"),
                (amplicon.thickEnd + 1, amplicon.end, "downstream"),
            )
            for primer_start, primer_end, location in flanks:
                by_chrom[amplicon.chrom].append(
                    Region(primer_start, primer_end, amplicon.strand, amplicon.reference, None, {"location": location})
                )
    return by_chrom
def testShallowFromAlignment(self):
    """
    Check the shallow areas reported by shallowFromAlignment() on a small three-contig alignment.

    Reference sequences of the fixture:

    art_chr1 and art_chr2 (128 nt):
    ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT

    art_chr3 (72 nt):
    ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT

    Alignments written in the SAM fixture:

    - read_1: art_chr1, position 12, CIGAR 3S58M.
    - read_2: art_chr1, position 71, CIGAR 52M.
    - read_3: art_chr2, position 8, CIGAR 44M8D51M.
    - no read is aligned on art_chr3.

    With a minimum depth of 1 counted by reads, the expected shallow areas on the
    selected regions are art_chr1:10-11, art_chr1:70-70, art_chr1:123-123 and the
    whole selected region of art_chr3 (10-40). Nothing is expected on art_chr2.
    """
    # Write the alignments as SAM then convert to an indexed BAM
    # NOTE(review): in this view the separators inside the SAM literal appear as
    # single spaces; a SAM file needs tab-separated fields and one record per
    # line - confirm the literal against the repository copy.
    with open(self.tmp_sam, "w") as writer:
        writer.write("""@SQ SN:art_chr1 LN:128 @SQ SN:art_chr2 LN:128 @SQ SN:art_chr3 LN:72 @PG ID:bwa PN:bwa VN:0.7.17-r1188 CL:bwa mem ref.fasta reads.fasta read_1 0 art_chr1 12 60 3S58M * 0 0 TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA * NM:i:1 MD:Z:7G50 AS:i:53 XS:i:0 read_2 0 art_chr1 71 60 52M * 0 0 ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG * NM:i:1 MD:Z:42G9 AS:i:47 XS:i:0 read_3 0 
art_chr2 8 60 44M8D51M * 0 0 AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAACATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC * NM:i:8 MD:Z:44^TACTAAAT51 AS:i:81 XS:i:0 """)
    samToBam(self.tmp_sam, self.tmp_bam)
    pysam.index(self.tmp_bam)

    # Minimal stand-in for the logger: shallowFromAlignment() is only given an object with info()
    class FakeLogger:
        def info(self, msg):
            pass

    # One selected region per contig
    selected_regions = RegionList([
        Region(10, 123, None, "art_chr1"),
        Region(10, 100, None, "art_chr2"),
        Region(10, 40, None, "art_chr3"),
    ])
    expected = [
        "art_chr1:10-11",
        "art_chr1:70-70",
        "art_chr1:123-123",
        "art_chr3:10-40",
    ]
    # Depth is counted by "reads" and every position with a depth under 1 is shallow
    observed = [
        str(elt) for elt in shallowFromAlignment(
            self.tmp_bam, selected_regions, "reads", 1, FakeLogger())
    ]
    # The comparison does not depend on the order of the returned areas
    self.assertEqual(sorted(expected), sorted(observed))
def __init__(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, itemRgb=None, blockCount=None, blockSizes=None, blockStarts=None):
    """
    Build an instance of the BED record.

    The location of the record (start, end, strand, chrom and name) is
    delegated to Region.__init__, chrom being given as the reference of the
    region. The other BED columns are stored unchanged as attributes.

    :param chrom: The chromosome on which the annotation has been defined.
    :param start: The start position of the annotation.
    :param end: The end position of the annotation.
    :param name: The name of the annotation.
    :param score: The score column of the BED line.
    :param strand: The strand of the annotation.
    :param thickStart: The start of the thick part of the feature.
    :param thickEnd: The end of the thick part of the feature.
    :param itemRgb: The color column of the BED line.
    :param blockCount: The number of blocks in the BED line.
    :param blockSizes: The sizes of the blocks.
    :param blockStarts: The starts of the blocks.
    """
    Region.__init__(self, start, end, strand, chrom, name)
    # Columns specific to the BED format
    self.score = score
    self.thickStart, self.thickEnd = thickStart, thickEnd
    self.itemRgb = itemRgb
    self.blockCount, self.blockSizes, self.blockStarts = blockCount, blockSizes, blockStarts
def __init__(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, itemRgb=None, blockCount=None, blockSizes=None, blockStarts=None):
    """
    Build and return an instance of BEDRecord.

    The location (start, end, strand, chrom, name) is handled by Region.__init__; the remaining BED columns are kept as attributes.

    :param chrom: Name of the chromosome holding the annotation.
    :type chrom: str
    :param start: Start position on the reference (1-based, start <= end).
    :type start: int
    :param end: End position on the reference (1-based, start <= end). [Default: start]
    :type end: int
    :param name: Name of the annotation.
    :type name: str
    :param score: Score between 0 and 1000.
    :type score: int
    :param strand: Strand of the annotation ("+" or "-").
    :type strand: str
    :param thickStart: Position where the feature starts to be drawn thickly, e.g. the start codon in gene displays (1-based).
    :type thickStart: int
    :param thickEnd: Position where the feature stops to be drawn thickly, e.g. the stop codon in gene displays (1-based).
    :type thickEnd: int
    :param itemRgb: RGB value of the form R,G,B (e.g. 255,0,0).
    :type itemRgb: list
    :param blockCount: Number of blocks (exons) in the BED line.
    :type blockCount: int
    :param blockSizes: Sizes of the blocks.
    :type blockSizes: list
    :param blockStarts: Starts of the blocks, relative to chromStart.
    :type blockStarts: list
    :return: The new instance.
    :rtype: BEDRecord
    """
    Region.__init__(self, start, end, strand, chrom, name)
    bed_columns = (
        ("score", score),
        ("thickStart", thickStart),
        ("thickEnd", thickEnd),
        ("itemRgb", itemRgb),
        ("blockCount", blockCount),
        ("blockSizes", blockSizes),
        ("blockStarts", blockStarts),
    )
    for column_name, column_value in bed_columns:
        setattr(self, column_name, column_value)
def testSplittedByRef(self):
    """Check that splittedByRef() dispatches each region under the name of its reference."""
    regions = RegionList([
        Region(10, 30, "-", "chr1", "region1"),
        Region(40, 70, "-", "chr1", "region2"),
        Region(80, 100, "-", "chr2", "region3")
    ])
    by_ref = splittedByRef(regions)
    # Flatten the result as "reference:region_name", references taken in sorted order
    observed = [
        "{}:{}".format(chrom, region.name)
        for chrom, chrom_regions in sorted(by_ref.items())
        for region in chrom_regions
    ]
    self.assertEqual(["chr1:region1", "chr1:region2", "chr2:region3"], observed)
def getTargets(in_aln, in_targets=None):
    """
    Return the list of targeted regions.

    Without targets file, each reference sequence declared in the header of
    the alignment file is a target covering the whole sequence. With a targets
    file, the regions are returned sorted by reference name, start and end
    after checking that none of them overlap.

    :param in_aln: Path to the alignment file (format: SAM/BAM).
    :type in_aln: str
    :param in_targets: Path to the targeted regions (format: BED). They must not contains any overlap.
    :type in_targets: str
    :return: List of targeted regions. It is empty when the targets file does not contain any region.
    :rtype: anacore.region.RegionList
    :raises Exception: When two targeted regions of the same reference overlap.
    """
    if in_targets is None:
        selected_regions = RegionList()
        with pysam.AlignmentFile(in_aln, "rb") as FH_bam:
            for ref_info in FH_bam.header["SQ"]:
                selected_regions.append(
                    Region(1, ref_info["LN"], "+", ref_info["SN"], ref_info["SN"])
                )
        return selected_regions
    sorted_regions = sorted(
        getAreas(in_targets),
        key=lambda x: (x.reference.name, x.start, x.end)
    )
    # Check lack of overlap: after sorting only the consecutive regions have
    # to be compared. zip() yields nothing for an empty or single-region list,
    # so an empty targets file no longer fails on the access to the first region.
    for prev_region, curr_region in zip(sorted_regions, sorted_regions[1:]):
        if curr_region.reference.name == prev_region.reference.name:
            if prev_region.end >= curr_region.start:
                raise Exception("The regions {} and {} contains an overlap.".format(prev_region, curr_region))
    # sorted() produces a plain list: wrap it to return the documented type in both modes
    return RegionList(sorted_regions)
def addToShallow(curr_chr, curr_pos, prev_opened, shallows):
    """
    Register a position with low depth in the running shallow frame.

    When the position directly follows the opened frame, the frame is extended.
    Otherwise the opened frame (if any) is closed and stored in shallows as a
    1-based region on strand "+", then a new frame is opened on the position.

    :param curr_chr: Name of the current region.
    :type curr_chr: str
    :param curr_pos: The current position with low DP (0-based).
    :type curr_pos: int
    :param prev_opened: The previous shallow frame ({"start": x, "end": y}). It is updated in place.
    :type prev_opened: dict
    :param shallows: The list of shallows areas. It is updated in place.
    :type shallows: anacore.region.RegionList
    """
    has_opened_frame = prev_opened["start"] is not None
    if has_opened_frame and prev_opened["end"] == curr_pos - 1:
        # Consecutive position: only extend the opened frame
        prev_opened["end"] = curr_pos
        return
    if has_opened_frame:
        # Gap with the opened frame: store it (0-based frame to 1-based region)
        closed_frame = Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", curr_chr)
        shallows.append(closed_frame)
    # Open a new frame limited to the current position
    prev_opened["start"] = curr_pos
    prev_opened["end"] = curr_pos
def getFragmentRegion(chrom_seq, target, target_seq, start_pos, fragment_len):
    """
    Return the region and the sequence of a fragment starting at start_pos.

    Outside the target the sequence comes from chrom_seq. On the target it is
    provided by getPartialFragment() from target_seq, which also reports the
    length still missing after the target.

    :param chrom_seq: Sequence of the chromosome.
    :param target: The targeted region (its start, end and reference are used).
    :param target_seq: Sequence used on the target.
    :param start_pos: Start position of the fragment on the chromosome (1-based).
    :param fragment_len: Length of the fragment.
    :return: Region from start_pos to the end of the fragment, on the reference of the target, with the fragment sequence in annotation "seq".
    """
    untargeted_end = start_pos + fragment_len - 1
    if start_pos > target.end or untargeted_end < target.start:
        # Fragment entirely after or entirely before the target
        end_pos = untargeted_end
        fragment_seq = chrom_seq[start_pos - 1:end_pos]  # Position is 1-based indexes are 0-based
    else:
        # Fragment overlaps the target
        fragment_seq = ""
        idx_on_target = start_pos - target.start
        if start_pos < target.start:
            # Part located before the target
            idx_on_target = 0
            fragment_seq = chrom_seq[start_pos - 1:target.start - 1]  # Position is 1-based indexes are 0-based
        # Part located on the target
        seq_on_target, end_idx, missing_len = getPartialFragment(
            target_seq, idx_on_target, fragment_len - len(fragment_seq)
        )
        fragment_seq += seq_on_target
        end_pos = target.start + end_idx + missing_len
        if missing_len > 0:
            # Part located after the target: the missing_len nucleotides following target.end
            fragment_seq += chrom_seq[target.end:target.end + missing_len]
    return Region(start_pos, end_pos, None, target.reference, None, {"seq": fragment_seq})
def __init__(self, seq_id=None, source=None, type=None, start=None, end=None, score=None, strand=None, phase=None, attributes=None):
    """
    Build and return an instance of GFF3Record.

    When the attributes contain "Name", its value becomes the name of the region and the entry is removed from a deep copy of the attributes: the dictionary given by the caller is never modified.

    :param seq_id: The ID of the landmark used to establish the coordinate system for the current feature.
    :type seq_id: str.
    :param source: Free text qualifier describing the algorithm or operating procedure that generated this feature (typically a software name such as "Genescan" or a database name such as "Genbank").
    :type source: str.
    :param type: The type of the feature (previously called the "method"): a term from the "lite" sequence ontology SOFA or a SOFA accession number (syntax SO:000000).
    :type type: str.
    :param start: The start of the feature, in 1-based integer coordinates, relative to the landmark given in seq_id. Start is always less than or equal to end.
    :type start: int.
    :param end: The end of the feature, in 1-based integer coordinates, relative to the landmark given in seq_id. Start is always less than or equal to end.
    :type end: int.
    :param score: The score of the feature. E-values are recommended for sequence similarity features and P-values for ab initio gene prediction features.
    :type score: float.
    :param strand: The strand of the instance ("+" or "-").
    :type strand: str.
    :param phase: For features of type "CDS", the number of bases (0, 1 or 2) to remove from the beginning of the feature to reach the first base of the next codon. It is counted from the start field on forward strand and from the end field on reverse strand. This is NOT the frame.
    :type phase: int.
    :param attributes: The annotations of the feature.
    :type attributes: dict.
    :return: The new instance.
    :rtype: GFF3Record
    """
    if attributes is None or "Name" not in attributes:
        feature_name = None
        other_attributes = attributes
    else:
        feature_name = attributes["Name"]
        # Work on a copy to leave the caller's dictionary untouched
        other_attributes = copy.deepcopy(attributes)
        other_attributes.pop("Name")
    Region.__init__(self, start, end, strand, seq_id, feature_name, other_attributes)
    self.source = source
    self.type = type
    self.score = score
    self.phase = phase
def shallowFromAlignment(aln_path, selected_regions, depth_mode, min_depth, log):
    """
    Return the list of shallow regions from the alignment file.

    Each selected region is scanned position by position. The positions
    without pileup column and the positions with a depth under min_depth are
    gathered in areas of consecutive positions. An area never spans two
    selected regions.

    :param aln_path: Path to the alignment file (format: SAM/BAM).
    :type aln_path: str
    :param selected_regions: Targeted regions. They must not contains any overlap between them.
    :type selected_regions: anacore.region.RegionList
    :param depth_mode: How count the depth: by reads (each reads is added independently) or by fragment (the R1 and R2 coming from the same pair are counted only once). Only the value "fragment" selects the count by fragment.
    :type depth_mode: str
    :param min_depth: All the locations with a depth under this value are reported in shallows areas.
    :type min_depth: int
    :param log: Logger of the script.
    :type log: logging.Logger
    :return: List of shallow regions (1-based, strand "+").
    :rtype: anacore.region.RegionList
    """
    shallow = RegionList()
    nb_selected_regions = len(selected_regions)
    idx_in_part = 1
    with pysam.AlignmentFile(aln_path, "rb") as FH_bam:
        for idx_region, region in enumerate(selected_regions):
            # Log the progress each time more than a tenth of the regions has been processed
            if idx_in_part > nb_selected_regions / 10:
                idx_in_part = 0
                log.info("Processed regions {}/{}.".format(idx_region + 1, nb_selected_regions))
            idx_in_part += 1
            prev_opened = {"start": None, "end": None}
            curr_checked = region.start - 1  # Next position to evaluate (0-based)
            # pysam expects a 0-based half-open interval: the 1-based inclusive
            # region [start, end] is [start - 1, end). A stop of end - 1 left out
            # the reads overlapping only the last position of the region and
            # produced an empty interval for the regions of one nucleotide.
            for pileupcolumn in FH_bam.pileup(region.reference.name, region.start - 1, region.end, max_depth=100000000):
                # The pileup also returns the columns of the fetched reads located outside the region
                if region.start <= pileupcolumn.reference_pos + 1 <= region.end:
                    # Missing positions
                    while curr_checked < pileupcolumn.reference_pos:
                        addToShallow(region.reference, curr_checked, prev_opened, shallow)
                        curr_checked += 1
                    # Current position
                    curr_reads_depth = 0
                    curr_frag = set()
                    for pileupread in pileupcolumn.pileups:
                        if not pileupread.alignment.is_secondary and not pileupread.alignment.is_duplicate and not pileupread.is_refskip:
                            curr_reads_depth += 1
                            curr_frag.add(pileupread.alignment.query_name)
                    curr_depth = curr_reads_depth
                    if depth_mode == "fragment":
                        # The reads of the same pair share their name
                        curr_depth = len(curr_frag)
                    if min_depth > curr_depth:
                        addToShallow(region.reference, pileupcolumn.reference_pos, prev_opened, shallow)
                    curr_checked = pileupcolumn.reference_pos + 1
            # Missing positions at the end of the region
            while curr_checked < region.end:
                addToShallow(region.reference, curr_checked, prev_opened, shallow)
                curr_checked += 1
            # Close the last opened area of the region (0-based frame to 1-based region)
            if prev_opened["start"] is not None:
                shallow.append(
                    Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", region.reference)
                )
    return shallow
def testGetPosOnRef(self):
    """Check the conversion from a position on the region to a position on the reference, for both strands."""
    # Forward strand: the first nt of the region is its start on the reference
    forward_region = Region(9, 15, "+")
    for pos_on_region, expected_pos in ((1, 9), (3, 11), (7, 15)):  # First nt, inner nt, last nt
        self.assertEqual(forward_region.getPosOnRef(pos_on_region), expected_pos)
    # NOTE: the out of region case (position 8) is not asserted by this test.
    # Reverse strand: the first nt of the region is its end on the reference
    reverse_region = Region(9, 15, "-")
    for pos_on_region, expected_pos in ((1, 15), (3, 13), (7, 9)):  # First nt, inner nt, last nt
        self.assertEqual(reverse_region.getPosOnRef(pos_on_region), expected_pos)
def mergedOverlapped(regions, padding=0, trace=False):
    """
    Return the regions after merging those that overlap.

    The regions are processed sorted by start then end. A region is merged in
    the previous kept region when, once both are extended by padding, it starts
    at or before the end of the previous one. The padding is only used to take
    the decision: the coordinates of the returned regions are not padded.

    The kept regions are the objects of the input: their end (and, with trace,
    their annotations) are modified in place.

    NOTE(review): the reference of the regions is not compared; this assumes
    all the regions are located on the same reference - confirm with callers.

    :param regions: The regions to merge.
    :type regions: list
    :param padding: Number of positions added on each side of the regions to evaluate the overlap.
    :type padding: int
    :param trace: If True, each region resulting from a merge stores the copy of its original regions in annot["merge_traceback"].
    :type trace: bool
    :return: The regions sorted by start and end, without overlap between them.
    :rtype: list
    """
    merged_regions = []
    prev_region = None
    for curr_region in sorted(regions, key=lambda x: (x.start, x.end)):
        curr_start = max(1, curr_region.start - padding)
        # The comparison must use the end of the previous kept region: using the
        # end of the current region made the test always true.
        if prev_region is not None and curr_start <= prev_region.end + padding:  # Overlap between regions
            if trace:
                if "merge_traceback" not in prev_region.annot:
                    # First merge on this region: keep its own original coordinates
                    prev_region.annot["merge_traceback"] = [
                        Region(prev_region.start, prev_region.end, prev_region.strand, prev_region.reference, prev_region.name)
                    ]
                prev_region.annot["merge_traceback"].append(
                    Region(curr_region.start, curr_region.end, curr_region.strand, curr_region.reference, curr_region.name)
                )
            prev_region.end = max(curr_region.end, prev_region.end)  # Max to manage included regions
        else:
            prev_region = curr_region
            merged_regions.append(curr_region)
    return merged_regions
def _parseLine(self):
    """
    Return a structured record from the TopHatFusionIO current line.

    The line is made of 7 sections separated by "@". Only the first one is used: it contains 11 tab-separated fields describing the fusion. The two breakpoints are returned as regions of one nucleotide.

    :return: The record.
    :rtype: dict
    """
    sections = [section.strip() for section in self.current_line.split('@')]
    # Unpacking checks the number of sections; only the fusion description is used
    fusion, _trash, _contig_a, _contig_b, _depth_a, _depth_b, _mate_distances = sections
    fusion_fields = [field.strip() for field in fusion.split("\t")]
    (chrom, break_a, break_b, orientation, nb_splitted_reads, nb_splitted_pairs,
     nb_pairs_splitted_reads, nb_contradict, base_cover_left, base_cover_right,
     _last_field) = fusion_fields
    chrom_a, chrom_b = chrom.split("-")
    pos_a = int(break_a)
    pos_b = int(break_b)
    # One letter by partner: "f" for the forward strand
    strand_a, strand_b = ("+" if code == "f" else "-" for code in orientation)
    record = {
        "partner_a": Region(pos_a, pos_a, strand_a, chrom_a),
        "partner_b": Region(pos_b, pos_b, strand_b, chrom_b)
    }
    counts = (
        ("nb_splitted_reads", nb_splitted_reads),
        ("nb_splitted_pairs", nb_splitted_pairs),
        ("nb_pairs_splitted_reads", nb_pairs_splitted_reads),
        ("nb_contradict", nb_contradict),
        ("base_cover_left", base_cover_left),
        ("base_cover_right", base_cover_right),
    )
    for key, raw_value in counts:
        record[key] = int(raw_value)
    return record
def getVariantRegion(variant):
    """
    @summary: Returns the region covered by the reference allele of the variant after normalization. The variant given in parameter is not modified: the normalization is applied on a copy.
    @param variant: [VCFRecord] The variant.
    @return: [Region] The region object corresponding to the variant.
    @warnings: This function can only be used on variant with only one alternative allele.
    """
    normalized = deepcopy(variant)
    normalized.normalizeSingleAllele()
    region_start = normalized.pos
    region_end = normalized.pos + len(normalized.ref) - 1  # Works also with normalized insertion
    return Region(region_start, region_end, None, normalized.chrom)
def testConsolidate(self):
    """Check consolidated() when only overlapping regions are merged and when contiguous regions are merged too."""
    reg_list = RegionList([
        Region(5, 9, "-", "chr1", "region1"),
        Region(10, 30, "-", "chr1", "region2"),
        Region(30, 40, "-", "chr1", "region3"),
        Region(35, 39, "-", "chr1", "region4"),
        Region(40, 70, "-", "chr1", "region5"),
        Region(71, 90, "-", "chr1", "region6"),
        Region(92, 100, "-", "chr1", "region7"),
        Region(100, 100, "+", "chr1", "region8"),
        Region(80, 100, "-", "chr2", "region9")
    ])
    scenarios = (
        # Merge overlapping
        (False, ["chr1:5-9[-]", "chr1:10-70[-]", "chr1:71-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]),
        # Merge overlapping and contiguous
        (True, ["chr1:5-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]),
    )
    for merge_contiguous, expected in scenarios:
        consolidated_reg = consolidated(reg_list, merge_contiguous)
        observed = [curr.getCoordinatesStr() for curr in consolidated_reg]
        self.assertEqual(expected, observed)
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the region object corresponding to the known variants in a VCF.

    A record is skipped when its gene symbol contains "_ENST" or when its count is under min_count. The records without the corresponding tag are kept.

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of samples where the variant is known in the databases to use its information.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database's samples with this variant.
    :type count: str
    :return: List of variants regions. Each region is annotated with "id", "gene", "HGVSp", "HGVSc" and "count".
    :rtype: anacore.region.RegionList
    """
    kept_variants = []
    with VCFIO(vcf_path) as reader:
        for record in reader:
            if symbol in record.info and "_ENST" in record.info[symbol]:
                continue
            if count in record.info and int(record.info[count]) < min_count:
                continue
            annotations = {
                "id": record.id,
                "gene": (record.info[symbol] if symbol in record.info else ""),
                "HGVSp": (record.info[hgvsp] if hgvsp in record.info else ""),
                "HGVSc": (record.info[hgvsc] if hgvsc in record.info else ""),
                "count": (int(record.info[count]) if count in record.info else None)
            }
            # NOTE(review): the end is pos + len(ref), one position after the last
            # nucleotide of the reference allele - confirm that this is intended.
            kept_variants.append(
                Region(record.pos, record.pos + len(record.ref), None, record.chrom, record.id, annotations)
            )
    return RegionList(kept_variants)
def exonsPos(record, genes_by_chr):
    """
    Return by positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.

    Only the exons on the strand of the breakend are taken into account. The start and the end of an exon are evaluated independently.

    :param record: Breakend record with CIPOS.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: By positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.
    :rtype: dict
    """
    record_strand = getStrand(record)
    exons_pos = {}
    start, end = getBNDInterval(record)
    bnd_interval = Region(start, end, None, record.chrom, record.getName())
    if record.chrom not in genes_by_chr:
        return exons_pos
    # Walk down the annotation tree: genes > transcripts > transcripts children
    for gene in genes_by_chr[record.chrom].getOverlapped(bnd_interval):
        for transcript in gene.children.getOverlapped(bnd_interval):
            for subregion in transcript.children.getOverlapped(bnd_interval):
                if record_strand != subregion.strand or not issubclass(subregion.__class__, Exon):
                    continue
                # Count each exon boundary contained in the breakend interval
                for boundary in (subregion.start, subregion.end):
                    if bnd_interval.start <= boundary <= bnd_interval.end:
                        exons_pos[boundary] = exons_pos.get(boundary, 0) + 1
    return exons_pos
def testGetTranscriptsAnnot_withoutUTR_oneExon(self):
    """Check getTranscriptsAnnot() on a transcript with one exon entirely coding, on forward then on reverse strand."""
    exon_1 = Exon(91, 150, "+", "chr1", "exon_2")
    cds_1 = CDS(91, 150, "+", "chr1", "cds_1")
    gene_1 = Gene(10, 350, None, "chr1", "gene_1", {"id": "g_1"})
    transcrit_1 = Transcript(None, None, None, "chr1", "transcrit_1", {"id": "tr_1"}, parent=gene_1, children=[exon_1])
    protein_1 = Protein(None, None, None, "chr1", "protein_2", children=[cds_1], transcript=transcrit_1)
    queries = [
        Region(80, 160, None, "chr1", "query_1", {"desc": "starts before exon_1 ; ends after exon_1."}),
        Region(91, 150, None, "chr1", "query_2", {"desc": "starts at start of exon_1 ; ends at end of exon_1."}),
        Region(100, 110, None, "chr1", "query_3", {"desc": "starts in exon_1 ; ends in exon_1."}),
        Region(80, 100, None, "chr1", "query_4", {"desc": "starts before exon_1 ; ends in exon_1."}),
        Region(110, 200, None, "chr1", "query_5", {"desc": "starts in exon_1 ; ends after exon_1."}),
    ]
    # By strand: the value expected in the STRAND field and, by query, the
    # expected (start_Protein_position, end_Protein_position)
    scenarios = (
        ("+", "1", {
            "query_1": (1, 20),
            "query_2": (1, 20),
            "query_3": (4, 7),
            "query_4": (1, 4),
            "query_5": (7, 20),
        }),
        ("-", "-1", {
            "query_1": (1, 20),
            "query_2": (1, 20),
            "query_3": (14, 17),
            "query_4": (17, 20),
            "query_5": (1, 14),
        }),
    )
    for strand, annot_strand, protein_pos_by_query in scenarios:
        # Apply strand
        for exon in transcrit_1.children:
            exon.strand = strand
        for cds in protein_1.children:
            cds.strand = strand
        transcrit_1.sortChildren()
        protein_1.sortChildren()
        # Assert
        for curr_query in queries:
            start_prot_pos, end_prot_pos = protein_pos_by_query[curr_query.name]
            expected = {
                "start_EXON": "1/1",
                "start_INTRON": None,
                "start_Protein_position": start_prot_pos,
                "end_EXON": "1/1",
                "end_INTRON": None,
                "end_Protein_position": end_prot_pos,
                "SYMBOL": "gene_1",
                "Gene": "g_1",
                "Feature": "tr_1",
                "Feature_type": "Transcript",
                "STRAND": annot_strand
            }
            annotations = getTranscriptsAnnot(curr_query, [transcrit_1])
            self.assertEqual([expected], annotations)
def testStrandedContains(self):
    """Check that strandedContains() accepts an included region only when it is on the same strand."""
    cases = (
        # container strand, [(included region strand, expected)]
        ("+", (("+", True), ("-", False))),  # Forward strand
        ("-", (("+", False), ("-", True))),  # Reverse strand
    )
    for container_strand, sub_cases in cases:
        container = Region(9, 15, container_strand, "chr1")
        for sub_strand, expected in sub_cases:
            self.assertEqual(
                container.strandedContains(Region(9, 12, sub_strand, "chr1")),
                expected
            )
) args = parser.parse_args() # Get transcripts gene_by_tr = getGeneByRefTr(args.input_reference_tr) selected_transcripts = getTranscriptAnnot(args.input_annotation, gene_by_tr) tr_by_chr = splittedByRef(selected_transcripts) # Write renamed regions out_nb_col = BEDIO.getMaxNbCol(args.input_regions) if out_nb_col == 3: out_nb_col = 4 with BEDIO(args.input_regions) as FH_regions: with BEDIO(args.output_regions, "w", out_nb_col) as FH_out: for record_idx, record in enumerate(FH_regions): target = Region(record.start, record.end, record.strand, record.chrom) if args.is_thick_based and record.thickStart is not None and record.thickEnd is not None: target.start = record.thickStart target.end = record.thickEnd overlapped_tr = list() if record.chrom in tr_by_chr: overlapped_tr = tr_by_chr[record.chrom].getOverlapped( target) if len(overlapped_tr) > 1: warnings.warn( "The region {} overlaps several transcripts ({}).". format(target, [str(tr) for tr in overlapped_tr])) if len(overlapped_tr) >= 1: overlapped_exons = overlapped_tr[0].children.getOverlapped( target) features = list()
def testGetMinDist(self):
    """Check getMinDist() with an overlapping, a following and a preceding region, then with another reference."""
    region = Region(9, 15, "+", "chr1")
    cases = (
        ((14, 18), 0),  # Overlaps the region
        ((16, 18), 1),  # Starts just after the region
        ((1, 5), 4),  # Ends before the region
    )
    for (other_start, other_end), expected_dist in cases:
        other = Region(other_start, other_end, "+", "chr1")
        self.assertEqual(region.getMinDist(other), expected_dist)
    # The regions located on two references cannot be compared
    with self.assertRaises(Exception):
        region.getMinDist(Region(1, 5, "+", "chr2"))
def testHasOverlap(self):
    """Check hasOverlap() with regions inside, across, around and outside the region, and on another reference."""
    region = Region(9, 15, "+", "chr1")
    cases = (
        # start, end, reference, expected
        (9, 9, "chr1", True),
        (15, 15, "chr1", True),
        (12, 13, "chr1", True),
        (9, 15, "chr1", True),
        (8, 14, "chr1", True),
        (10, 16, "chr1", True),
        (8, 16, "chr1", True),
        (8, 8, "chr1", False),
        (16, 16, "chr1", False),
        (12, 13, "chr2", False),
    )
    for other_start, other_end, other_ref, expected in cases:
        self.assertEqual(
            region.hasOverlap(Region(other_start, other_end, "+", other_ref)),
            expected
        )
def testGetPosOnRegion(self):
    """Check the conversion from a position on the reference to a position on the region, for both strands."""
    cases = (
        # strand, [(position on reference, expected position on region)]: first nt, inner nt, last nt
        ("+", ((9, 1), (11, 3), (15, 7))),  # Forward strand
        ("-", ((15, 1), (13, 3), (9, 7))),  # Reverse strand
    )
    for strand, expected_by_pos in cases:
        stranded_region = Region(9, 15, strand, "chr1")
        for ref_pos, expected_pos in expected_by_pos:
            self.assertEqual(stranded_region.getPosOnRegion(ref_pos), expected_pos)
        # Out of region
        for out_pos in (8, 16):
            with self.assertRaises(ValueError):
                stranded_region.getPosOnRegion(out_pos)
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Return True if the two breakends can be a readthrough.

    The fusion is a readthrough candidate when: the two breakends are on the
    same chromosome and on the same strand, the interval covering them is not
    longer than rt_max_dist, each breakend is annotated by at least one gene
    of this strand that does not annotate the other breakend, and no other
    gene of the interval is found apart from both partner genes.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    is_readthrough = False
    if up.chrom == down.chrom:
        up_strand = getStrand(up, True)
        down_strand = getStrand(down, False)
        if (up_strand == "+" and down_strand == "+") or (up_strand == "-" and down_strand == "-"):  # Readthrough are +/+ or -/-
            # Order the breakends by position on the chromosome
            first = up
            second = down
            if first.pos > second.pos:
                first = down
                second = up
            # Interval covering the two breakends
            first_start, first_end = getBNDInterval(first)
            second_start, second_end = getBNDInterval(second)
            interval_start = min(first_start, second_start)
            interval_end = max(first_end, second_end) + 1
            if interval_end - interval_start <= rt_max_dist:
                # Genes annotating each breakend
                first_bp_gene = {annCmpName(annot) for annot in first.info[annotation_field]}
                second_bp_gene = {annCmpName(annot) for annot in second.info[annotation_field]}
                full_overlapping_gene = first_bp_gene & second_bp_gene  # Genes annotating the two breakends
                only_first_bp_gene = first_bp_gene - second_bp_gene
                only_second_bp_gene = second_bp_gene - first_bp_gene
                if len(only_first_bp_gene) != 0 and len(only_second_bp_gene) != 0:
                    # Keep only the genes on the strand of the fusion (up_strand and down_strand are equal here)
                    strand_by_gene = {annCmpName(annot): annot["STRAND"] for annot in first.info[annotation_field] + second.info[annotation_field]}
                    only_first_bp_gene = {gene for gene in only_first_bp_gene if strand_by_gene[gene] == up_strand}
                    only_second_bp_gene = {gene for gene in only_second_bp_gene if strand_by_gene[gene] == up_strand}
                    possible_on_strand = len(only_first_bp_gene) != 0 and len(only_second_bp_gene) != 0
                    if possible_on_strand:
                        # Genes of the strand located on the interval, except the genes annotating the two breakends
                        interval_region = Region(interval_start, interval_end, up_strand, first.chrom)
                        overlapped_genes = genes.getChr(first.chrom).getOverlapped(interval_region)
                        overlapped_genes = RegionList([gene for gene in overlapped_genes if regCmpName(gene) not in full_overlapping_gene and gene.strand == up_strand])
                        overlapped_genes_by_id = {regCmpName(gene): gene for gene in overlapped_genes}
                        # NOTE(review): the lookups below presume that every gene named in the
                        # annotations is returned by getOverlapped() with the same unique name;
                        # otherwise they raise a KeyError - verify against the annotation source.
                        contradict_readthrough = False
                        for start_gene_id in only_first_bp_gene:
                            start_gene = overlapped_genes_by_id[start_gene_id]
                            for end_gene_id in only_second_bp_gene:
                                end_gene = overlapped_genes_by_id[end_gene_id]
                                # A third gene of the interval overlapping neither partner contradicts the readthrough
                                for interval_gene in overlapped_genes:
                                    if regCmpName(interval_gene) != regCmpName(start_gene) and \
                                       regCmpName(interval_gene) != regCmpName(end_gene):
                                        if not interval_gene.hasOverlap(start_gene) and not interval_gene.hasOverlap(end_gene):
                                            contradict_readthrough = True
                        is_readthrough = not contradict_readthrough
    return is_readthrough
def testLength(self):
    """Check Region.length() for several strands, a missing end and a single position."""
    # (constructor arguments, expected length)
    cases = [
        ((9, 15, None), 7),
        ((9, 15, "+"), 7),
        ((9, 15, "-"), 7),
        ((9, None, "-"), 1),
        ((9, 9, "-"), 1),
    ]
    for region_args, expected_length in cases:
        self.assertEqual(Region(*region_args).length(), expected_length)
def testContains(self):
    """Check Region.contains() against included, overflowing, external and other-reference regions."""
    container = Region(9, 15, "+", "chr1")
    # (start, end, reference, expected result of container.contains())
    cases = [
        (9, 9, "chr1", True),
        (15, 15, "chr1", True),
        (12, 13, "chr1", True),
        (9, 15, "chr1", True),
        (8, 14, "chr1", False),
        (10, 16, "chr1", False),
        (8, 16, "chr1", False),
        (8, 8, "chr1", False),
        (16, 16, "chr1", False),
        (12, 13, "chr2", False),
    ]
    for start, end, reference, expected in cases:
        candidate = Region(start, end, "+", reference)
        self.assertEqual(container.contains(candidate), expected)
def testIterOverlapped_3(self):
    """Case where a subject is included in another."""
    # Subjects: sbjct_3 (16-18) lies inside sbjct_2 (14-20).
    sbjct_1 = Region(7, 10, "+", "chr1", "sbjct_1")
    sbjct_2 = Region(14, 20, "+", "chr1", "sbjct_2")
    sbjct_3 = Region(16, 18, "+", "chr1", "sbjct_3")
    sbjct_4 = Region(24, 29, "+", "chr1", "sbjct_4")
    subjects = RegionList([sbjct_1, sbjct_2, sbjct_3, sbjct_4])

    # Expected hit combinations (copied into a fresh list for each query).
    no_hit = ()
    only_2 = (sbjct_2,)
    hit_2_3 = (sbjct_2, sbjct_3)
    hit_2_4 = (sbjct_2, sbjct_4)
    hit_2_3_4 = (sbjct_2, sbjct_3, sbjct_4)

    # (start, end, query name, expected overlapped subjects)
    # NOTE(review): query_l7_07 covers 19-24 like query_l6_07 - confirm this is intended.
    spec = [
        (11, 11, "query_l1_01", no_hit),
        (12, 12, "query_l1_02", no_hit),
        (13, 13, "query_l1_03", no_hit),
        (14, 14, "query_l1_04", only_2),
        (15, 15, "query_l1_05", only_2),
        (16, 16, "query_l1_06", hit_2_3),
        (17, 17, "query_l1_07", hit_2_3),
        (18, 18, "query_l1_08", hit_2_3),
        (19, 19, "query_l1_09", only_2),
        (20, 20, "query_l1_10", only_2),
        (21, 21, "query_l1_11", no_hit),
        (22, 22, "query_l1_12", no_hit),
        (11, 13, "query_l3_01", no_hit),
        (12, 14, "query_l3_02", only_2),
        (13, 15, "query_l3_03", only_2),
        (14, 16, "query_l3_04", hit_2_3),
        (15, 17, "query_l3_05", hit_2_3),
        (16, 18, "query_l3_06", hit_2_3),
        (17, 19, "query_l3_07", hit_2_3),
        (18, 20, "query_l3_08", hit_2_3),
        (19, 21, "query_l3_09", only_2),
        (20, 22, "query_l3_10", only_2),
        (21, 23, "query_l3_11", no_hit),
        (13, 17, "query_l5_01", hit_2_3),
        (15, 19, "query_l5_02", hit_2_3),
        (17, 21, "query_l5_03", hit_2_3),
        (18, 22, "query_l5_04", hit_2_3),
        (19, 23, "query_l5_05", only_2),
        (20, 24, "query_l5_06", hit_2_4),
        (13, 18, "query_l6_01", hit_2_3),
        (14, 19, "query_l6_02", hit_2_3),
        (15, 20, "query_l6_03", hit_2_3),
        (16, 21, "query_l6_04", hit_2_3),
        (17, 22, "query_l6_05", hit_2_3),
        (18, 23, "query_l6_06", hit_2_3),
        (19, 24, "query_l6_07", hit_2_4),
        (13, 19, "query_l7_01", hit_2_3),
        (14, 20, "query_l7_02", hit_2_3),
        (15, 21, "query_l7_03", hit_2_3),
        (16, 22, "query_l7_04", hit_2_3),
        (17, 23, "query_l7_05", hit_2_3),
        (18, 24, "query_l7_06", hit_2_3_4),
        (19, 24, "query_l7_07", hit_2_4),
        (13, 20, "query_l8_01", hit_2_3),
        (14, 21, "query_l8_02", hit_2_3),
        (15, 22, "query_l8_03", hit_2_3),
        (13, 21, "query_l9_01", hit_2_3),
        (14, 22, "query_l9_02", hit_2_3),
        (13, 22, "query_l10_01", hit_2_3),
    ]
    queries_info = [
        {"query": Region(start, end, "+", "chr1", name), "overlapped": list(hits)}
        for start, end, name, hits in spec
    ]
    queries_info.sort(key=lambda info: (info["query"].start, info["query"].end))

    def observe(query_list, subject_list):
        # Keep only the overlapped subjects yielded for each query.
        return [hits for _, hits in iterOverlapped(query_list, subject_list)]

    # Independant evaluation
    for info in queries_info:
        self.assertEqual(observe([info["query"]], subjects), [info["overlapped"]])

    all_queries = [info["query"] for info in queries_info]
    all_expected = [info["overlapped"] for info in queries_info]

    # Grouped evaluation
    self.assertEqual(observe(all_queries, subjects), all_expected)

    # Grouped evaluation and inclusion between subjects starts the list of subjects
    self.assertEqual(observe(all_queries, subjects[1:]), all_expected)

    # Grouped evaluation and inclusion between subjects ends the list of subjects
    expected_without_last = [
        [elt for elt in info["overlapped"] if elt != sbjct_4]
        for info in queries_info
    ]
    self.assertEqual(observe(all_queries, subjects[:-1]), expected_without_last)
def testHasStrandedOverlap(self):
    """Check that hasStrandedOverlap() is True only when the overlapping regions share their strand."""
    # Forward container first, then reverse; each is compared to a forward
    # then a reverse query covering positions 5-12.
    for container_strand in ("+", "-"):
        container = Region(9, 15, container_strand, "chr1")
        for query_strand in ("+", "-"):
            query = Region(5, 12, query_strand, "chr1")
            self.assertEqual(
                container.hasStrandedOverlap(query),
                container_strand == query_strand
            )
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return by chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).
    :rtype: dict
    :raises Exception: If neither breakend of a pair has the RNA_FIRST tag in its info.
    """
    caller_fusions = dict()
    processed_fusions = set()
    fusion_by_name = {}
    for record in bnd_by_id.values():
        for alt_idx, _ in enumerate(record.alt):
            # First breakend: the record itself, or its allele-specific
            # version when the record has several mates.
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:
                first_new_id += "_" + str(alt_idx)  # Record must be splitted for each mate
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            # Second breakend: the mate, split the same way if needed.
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                # list.index() returns an int: it must be converted before
                # being concatenated to the ID.
                second_new_id += "_" + str(first_idx)  # Record must be splitted for each mate
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            # The key is built before the IDs are rewritten and is sorted, so
            # it does not depend on which breakend of the pair comes first.
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id]))
            # NOTE(review): when a breakend has a single ALT it is the object
            # stored in bnd_by_id, so its MATEID is rewritten in place here -
            # confirm later lookups by mate ID still resolve.
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id in processed_fusions:
                continue  # Pair already handled from its other breakend
            processed_fusions.add(fusion_id)
            if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                raise Exception(
                    "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                    .format(alt_first_bnd.id, mate_id))
            if "RNA_FIRST" in alt_second_bnd.info:
                # The breakend tagged RNA_FIRST becomes the first one.
                alt_first_bnd, alt_second_bnd = alt_second_bnd, alt_first_bnd
            interval_first_bnd = getBNDInterval(alt_first_bnd)
            fusion_name = " @@ ".join(
                sorted([alt_first_bnd.getName(), alt_second_bnd.getName()]))
            if fusion_name not in fusion_by_name:
                region_first_bnd = Region(
                    interval_first_bnd[0],
                    interval_first_bnd[1],
                    reference=alt_first_bnd.chrom,
                    annot={
                        "first": alt_first_bnd,
                        "second": alt_second_bnd
                    })
                if alt_first_bnd.chrom not in caller_fusions:
                    caller_fusions[alt_first_bnd.chrom] = RegionList()
                caller_fusions[alt_first_bnd.chrom].append(region_first_bnd)
                fusion_by_name[fusion_name] = region_first_bnd
            else:  # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                known_fusion = fusion_by_name[fusion_name]
                known_fusion.annot["first"].info[annotation_field] += alt_first_bnd.info[annotation_field]
                known_fusion.annot["second"].info[annotation_field] += alt_second_bnd.info[annotation_field]
    return caller_fusions
def testFromStr(self):
    """Check that Region.fromStr() on "reference:start-end" matches the equivalent unstranded Region."""
    self.assertEqual(
        str(Region.fromStr("12:1534187-1534287")),
        str(Region(1534187, 1534287, None, "12"))
    )