def getTargets(in_aln, in_targets=None):
    """
    Return the list of targeted regions.

    Without BED file, each reference sequence declared in the alignment header
    becomes one target covering its whole length. With a BED file, the targets
    are loaded from it, sorted by (reference name, start, end) and checked for
    overlaps.

    :param in_aln: Path to the alignment file (format: SAM/BAM).
    :type in_aln: str
    :param in_targets: Path to the targeted regions (format: BED). They must not contains any overlap.
    :type in_targets: str
    :return: List of targeted regions.
    :rtype: anacore.region.RegionList
    :raises Exception: If two consecutive sorted targets on the same reference overlap.
    """
    if in_targets is None:
        # One target per reference sequence declared in the header
        selected_regions = RegionList()
        with pysam.AlignmentFile(in_aln, "rb") as FH_bam:
            for ref_info in FH_bam.header["SQ"]:
                selected_regions.append(
                    Region(1, ref_info["LN"], "+", ref_info["SN"], ref_info["SN"])
                )
        return selected_regions
    # Keep the documented return type: sorted() alone returns a plain list
    selected_regions = RegionList(
        sorted(
            getAreas(in_targets),
            key=lambda x: (x.reference.name, x.start, x.end)
        )
    )
    # Check lack of overlap (pairing with zip is safe on an empty BED)
    for prev_region, curr_region in zip(selected_regions, selected_regions[1:]):
        if curr_region.reference.name == prev_region.reference.name:
            if prev_region.end >= curr_region.start:
                raise Exception("The regions {} and {} contains an overlap.".format(prev_region, curr_region))
    return selected_regions
def shallowFromAlignment(aln_path, selected_regions, depth_mode, min_depth, log):
    """
    Return the list of shallow regions from the alignment file.

    :param aln_path: Path to the alignment file (format: SAM/BAM).
    :type aln_path: str
    :param selected_regions: Targeted regions. They must not contains any overlap between them.
    :type selected_regions: anacore.region.RegionList
    :param depth_mode: How count the depth: by reads (each reads is added independently) or by fragment (the R1 and R2 coming from the same pair are counted only once).
    :type depth_mode: str
    :param min_depth: All the locations with a depth under this value are reported in shallows areas.
    :type min_depth: int
    :param log: Logger of the script.
    :type log: logging.Logger
    :return: List of shallow regions.
    :rtype: anacore.region.RegionList
    """
    shallow = RegionList()
    nb_selected_regions = len(selected_regions)
    idx_in_part = 1
    with pysam.AlignmentFile(aln_path, "rb") as FH_bam:
        for idx_region, region in enumerate(selected_regions):
            # Progress message roughly every 10% of the regions
            if idx_in_part > nb_selected_regions / 10:
                idx_in_part = 0
                log.info("Processed regions {}/{}.".format(idx_region + 1, nb_selected_regions))
            idx_in_part += 1
            # Shallow area currently being extended (0-based positions)
            prev_opened = {"start": None, "end": None}
            curr_checked = region.start - 1
            # pysam takes a 0-based half-open interval: the 1-based inclusive
            # target [start, end] is [start - 1, end). Using end - 1 as stop
            # would not fetch the reads starting on the last target position.
            pileup_iter = FH_bam.pileup(
                region.reference.name,
                region.start - 1,
                region.end,
                max_depth=100000000
            )
            for pileupcolumn in pileup_iter:
                # Columns outside the target are also returned: skip them
                if region.start <= pileupcolumn.reference_pos + 1 <= region.end:
                    # Missing positions
                    while curr_checked < pileupcolumn.reference_pos:
                        addToShallow(region.reference, curr_checked, prev_opened, shallow)
                        curr_checked += 1
                    # Current position
                    curr_reads_depth = 0
                    curr_frag = set()
                    for pileupread in pileupcolumn.pileups:
                        aln = pileupread.alignment
                        if not aln.is_secondary and not aln.is_duplicate and not pileupread.is_refskip:
                            curr_reads_depth += 1
                            curr_frag.add(aln.query_name)
                    curr_depth = curr_reads_depth
                    if depth_mode == "fragment":
                        # Reads sharing a query name count for one fragment
                        curr_depth = len(curr_frag)
                    if min_depth > curr_depth:
                        addToShallow(region.reference, pileupcolumn.reference_pos, prev_opened, shallow)
                    curr_checked = pileupcolumn.reference_pos + 1
            # Missing positions
            while curr_checked < region.end:
                addToShallow(region.reference, curr_checked, prev_opened, shallow)
                curr_checked += 1
            # Close the last opened shallow area (back to 1-based positions)
            if prev_opened["start"] is not None:
                shallow.append(
                    Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", region.reference)
                )
    return shallow
def getAreas(in_bed):
    """
    @summary: Returns the list of areas from a BED file.
    @param in_bed: [str] The path to the areas description (format: BED).
    @returns: [RegionList] The list of areas.
    """
    with BEDIO(in_bed) as bed_reader:
        # The whole file is loaded before the handle is released
        return RegionList(bed_reader.read())
def getAreas(in_bed):
    """
    Return the list of areas from a BED file.

    :param in_bed: The path to the areas description (format: BED).
    :type in_bed: str
    :return: The list of areas.
    :rtype: region.RegionList
    """
    with BEDIO(in_bed) as bed_reader:
        records = bed_reader.read()
    return RegionList(records)
def getPrimersByChr(in_regions):
    """
    @summary: Returns the list of primers by chromosome.
    @param in_regions: [str] Path to the amplicons design with their primers (format: BED). The zone of interest is defined by thickStart and thickEnd.
    @return: [dict] By chromosome an instance of RegionList containing the primers. Each primer has an location annotation: upstream or downstream (this information is strand + based).
    """
    primers_by_chr = {}
    with BEDIO(in_regions) as FH_in:
        for amplicon in FH_in:
            chrom_primers = primers_by_chr.get(amplicon.chrom)
            if chrom_primers is None:
                chrom_primers = RegionList()
                primers_by_chr[amplicon.chrom] = chrom_primers
            if amplicon.thickStart is None or amplicon.thickEnd is None:
                raise Exception(
                    'The BED file "' + in_regions + '" does not contains thickStart and thickEnd for all the amplicons.'
                )
            # Primers are the amplicon parts flanking the thick interval
            chrom_primers.append(
                Region(
                    amplicon.start,
                    amplicon.thickStart - 1,
                    amplicon.strand,
                    amplicon.reference,
                    None,
                    {"location": "upstream"}
                )
            )
            chrom_primers.append(
                Region(
                    amplicon.thickEnd + 1,
                    amplicon.end,
                    amplicon.strand,
                    amplicon.reference,
                    None,
                    {"location": "downstream"}
                )
            )
    return primers_by_chr
def testShallowFromAlignment(self):
    """
    Check the shallow areas found by shallowFromAlignment on a small alignment.

    Fixture (1-based positions):
      - art_chr1 (128 nt): read_1 starts at 12 with CIGAR 3S58M and read_2
        starts at 71 with CIGAR 52M; target 10-123.
      - art_chr2 (128 nt): read_3 starts at 8 with CIGAR 44M8D51M; target
        10-100.
      - art_chr3 (72 nt): no read; target 10-40.

    With a minimum depth of 1 in "reads" mode the expected shallow areas are
    art_chr1:10-11, art_chr1:70-70, art_chr1:123-123 and art_chr3:10-40; no
    area is expected on art_chr2.
    """
    # SAM is tab-delimited with one record per line: build it explicitly so
    # the separators cannot be lost in the source code.
    sam_records = [
        ["@SQ", "SN:art_chr1", "LN:128"],
        ["@SQ", "SN:art_chr2", "LN:128"],
        ["@SQ", "SN:art_chr3", "LN:72"],
        ["@PG", "ID:bwa", "PN:bwa", "VN:0.7.17-r1188", "CL:bwa mem ref.fasta reads.fasta"],
        [
            "read_1", "0", "art_chr1", "12", "60", "3S58M", "*", "0", "0",
            "TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA",
            "*", "NM:i:1", "MD:Z:7G50", "AS:i:53", "XS:i:0"
        ],
        [
            "read_2", "0", "art_chr1", "71", "60", "52M", "*", "0", "0",
            "ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG",
            "*", "NM:i:1", "MD:Z:42G9", "AS:i:47", "XS:i:0"
        ],
        [
            "read_3", "0", "art_chr2", "8", "60", "44M8D51M", "*", "0", "0",
            "AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAACATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC",
            "*", "NM:i:8", "MD:Z:44^TACTAAAT51", "AS:i:81", "XS:i:0"
        ],
    ]
    with open(self.tmp_sam, "w") as writer:
        for fields in sam_records:
            writer.write("\t".join(fields) + "\n")
    samToBam(self.tmp_sam, self.tmp_bam)
    pysam.index(self.tmp_bam)

    class FakeLogger:
        """Logger stub: progress messages are ignored."""

        def info(self, msg):
            pass

    selected_regions = RegionList([
        Region(10, 123, None, "art_chr1"),
        Region(10, 100, None, "art_chr2"),
        Region(10, 40, None, "art_chr3"),
    ])
    expected = [
        "art_chr1:10-11",
        "art_chr1:70-70",
        "art_chr1:123-123",
        "art_chr3:10-40",
    ]
    observed = [
        str(elt) for elt in shallowFromAlignment(
            self.tmp_bam, selected_regions, "reads", 1, FakeLogger())
    ]
    self.assertEqual(sorted(expected), sorted(observed))
def getAreasByChr(in_bed):
    """
    @summary: Returns from a BED file the list of areas by chromosome.
    @param in_bed: [str] The path to the areas description (format: BED).
    @returns: [dict] The list of areas by chromosome (each list is an instance of Regionlist).
    """
    by_chrom = {}
    for area in getAreas(in_bed):
        ref_name = area.reference.name
        chrom_areas = by_chrom.get(ref_name)
        if chrom_areas is None:
            # First area seen on this chromosome
            chrom_areas = RegionList()
            by_chrom[ref_name] = chrom_areas
        chrom_areas.append(area)
    return by_chrom
def getSortedAreasByChr(in_bed):
    """
    Return by chromosome the list of sorted areas from a BED file.

    Areas are ordered by start then end within each chromosome.

    :param in_bed: The path to the areas description (format: BED).
    :type in_bed: str
    :return: The list of sorted areas by chromosome (each list is an instance of region.Regionlist).
    :rtype: dict
    """
    def by_position(area):
        return (area.start, area.end)

    return {
        chrom: RegionList(sorted(chrom_areas, key=by_position))
        for chrom, chrom_areas in getAreasByChr(in_bed).items()
    }
def testSplittedByRef(self):
    """Check that splittedByRef returns the regions grouped under their reference name."""
    reg_list = RegionList([
        Region(10, 30, "-", "chr1", "region1"),
        Region(40, 70, "-", "chr1", "region2"),
        Region(80, 100, "-", "chr2", "region3")
    ])
    # Flatten the groups in chromosome order to compare with a simple list
    observed = [
        "{}:{}".format(chrom, curr_region.name)
        for chrom, regions in sorted(splittedByRef(reg_list).items())
        for curr_region in regions
    ]
    self.assertEqual(
        ["chr1:region1", "chr1:region2", "chr2:region3"],
        observed
    )
def proteins(self, proteins):
    """
    Change proteins linked with transcript.

    Before the add of the new proteins the old are unlinked (prot.transcript is set to None).

    :param proteins: The new linked proteins.
    :type proteins: list
    """
    # Remove previous proteins. Iterate on a snapshot: if assigning
    # prot.transcript also removes prot from this transcript's list, looping
    # on the live list would skip entries.
    for prot in list(self.proteins):
        prot.transcript = None  # Remove link in protein
    self._proteins = RegionList()  # Remove link in transcript
    # Add new proteins
    # NOTE(review): nothing is appended to self._proteins here; presumably
    # the assignment to curr_prot.transcript registers the protein in the
    # transcript - confirm against the protein class.
    if proteins is not None:
        for curr_prot in proteins:
            curr_prot.transcript = self
def getAreasByChr(in_bed):
    """
    Return by chromosome the list of areas from a BED file.

    :param in_bed: The path to the areas description (format: BED).
    :type in_bed: str
    :return: The list of areas by chromosome (each list is an instance of region.Regionlist).
    :rtype: dict
    """
    grouped = {}
    for area in getAreas(in_bed):
        ref_name = area.reference.name
        try:
            chrom_areas = grouped[ref_name]
        except KeyError:
            # First area seen on this chromosome
            chrom_areas = RegionList()
            grouped[ref_name] = chrom_areas
        chrom_areas.append(area)
    return grouped
def filteredByOverlap(targets_by_chr, selected_genes):
    """
    Return targeted areas overlapping selected genes.

    Each overlap is trimmed to the part shared by the gene and the target,
    then the overlapping intervals of a chromosome are merged.

    :param targets_by_chr: RegionList of targets by chromosomes.
    :type targets_by_chr: dict
    :param selected_genes: Translocated genes. Each element is a Gene object from genomic annotations.
    :type selected_genes: list
    :return: By chromosome of the selected genes, the sorted list of [start, end] intervals shared by targets and genes.
    :rtype: dict
    """
    # Genes by chromosome
    genes_by_chr = dict()
    for gene in selected_genes:
        chrom = gene.reference.name
        # The membership test must be done on the dictionary being built:
        # testing the list of genes never matches a chromosome name and each
        # gene would replace the previous ones of its chromosome.
        if chrom in genes_by_chr:
            genes_by_chr[chrom].append(gene)
        else:
            genes_by_chr[chrom] = RegionList([gene])
    # Find overlaps between targets and selected genes
    trimmed_targets_by_chr = dict()
    for chrom, genes in genes_by_chr.items():
        overlaps = list()
        targets = targets_by_chr.get(chrom)
        if targets:  # A chromosome without target has no overlap
            # NOTE(review): iterOverlapped presumably expects queries sorted
            # by position - confirm. Sorting does not change the result
            # because overlaps are sorted again below.
            sorted_genes = RegionList(
                sorted(genes, key=lambda x: (x.start, x.end)))
            for gene, overlapped in iterOverlapped(sorted_genes, targets):
                for curr in overlapped:
                    overlaps.append(
                        [max(gene.start, curr.start), min(gene.end, curr.end)])
        # Merge overlapping intervals (contiguous ones are kept separated)
        consolidated_overlaps = list()
        if len(overlaps) > 0:
            overlaps = sorted(overlaps, key=lambda x: (x[0], x[1]))
            consolidated_overlaps = [overlaps[0]]
            prev = overlaps[0]
            for curr in overlaps[1:]:
                if curr[0] > prev[1]:
                    consolidated_overlaps.append(curr)
                    prev = curr
                else:
                    # prev is the last consolidated interval: extend it
                    prev[1] = max(curr[1], prev[1])
        trimmed_targets_by_chr[chrom] = consolidated_overlaps
    return trimmed_targets_by_chr
def testConsolidate(self):
    """Check consolidated() without and with the merge of contiguous regions."""
    reg_list = RegionList([
        Region(5, 9, "-", "chr1", "region1"),
        Region(10, 30, "-", "chr1", "region2"),
        Region(30, 40, "-", "chr1", "region3"),
        Region(35, 39, "-", "chr1", "region4"),
        Region(40, 70, "-", "chr1", "region5"),
        Region(71, 90, "-", "chr1", "region6"),
        Region(92, 100, "-", "chr1", "region7"),
        Region(100, 100, "+", "chr1", "region8"),
        Region(80, 100, "-", "chr2", "region9")
    ])
    # Each case: (value of the second argument, expected coordinates)
    cases = [
        # Merge overlapping
        (
            False,
            ["chr1:5-9[-]", "chr1:10-70[-]", "chr1:71-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]
        ),
        # Merge overlapping and contiguous
        (
            True,
            ["chr1:5-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]
        ),
    ]
    for merge_contiguous, expected in cases:
        consolidated_reg = consolidated(reg_list, merge_contiguous)
        observed = []
        for curr in consolidated_reg:
            observed.append(curr.getCoordinatesStr())
        self.assertEqual(expected, observed)
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the region object corresponding to the known variants in a VCF.

    Records whose gene symbol contains "_ENST" are skipped, as are records
    with a count tag lower than min_count. Records without count tag are kept.

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of samples where the variant is known in the databases to use its information.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database's samples with this variant.
    :type count: str
    :return: List of variants regions.
    :rtype: anacore.region.RegionList
    """
    kept_regions = []
    with VCFIO(vcf_path) as FH_in:
        for record in FH_in:
            if symbol in record.info and "_ENST" in record.info[symbol]:
                continue
            if count in record.info and int(record.info[count]) < min_count:
                continue
            annot = {
                "id": record.id,
                "gene": (record.info[symbol] if symbol in record.info else ""),
                "HGVSp": (record.info[hgvsp] if hgvsp in record.info else ""),
                "HGVSc": (record.info[hgvsc] if hgvsc in record.info else ""),
                "count": (int(record.info[count]) if count in record.info else None)
            }
            # NOTE(review): the end is pos + len(ref), one position after the
            # last reference base if coordinates are 1-based inclusive -
            # confirm this is intended.
            kept_regions.append(
                Region(
                    record.pos,
                    record.pos + len(record.ref),
                    None,
                    record.chrom,
                    record.id,
                    annot
                )
            )
    return RegionList(kept_regions)
def __init__(self, start=None, end=None, strand=None, reference=None, name=None, annot=None, parent=None, children=None, proteins=None):
    """
    Build and return an instance of Transcript.

    :param start: The start position on the reference. This position is 1-based and ascending (start <= end).
    :type start: int
    :param end: The end position on the reference. This position is 1-based and ascending (start <= end). [Default: start]
    :type end: int
    :param strand: The strand of the instance ("+" or "-").
    :type strand: str
    :param reference: The region object or the region name of the reference.
    :type reference: region.Region | str
    :param name: The name of the region.
    :type name: str
    :param annot: The annotations of the region.
    :type annot: dict
    :param parent: The gene.
    :type parent: region.RegionTree
    :param children: The list of exons.
    :type children: region.RegionList
    :param proteins: The list of proteins produced on transcript. It can exists several proteins on transcript: operon and readthrough.
    :type proteins: region.RegionList
    :return: The new instance.
    :rtype: genomicRegion.Transcript
    """
    # Coordinates, annotations and tree links are handled by the parent class
    RegionTree.__init__(self, start, end, strand, reference, name, annot, parent, children)
    # The storage is created empty before the assignment of the attribute
    # proteins (presumably a property whose setter links each protein with
    # this transcript - confirm against the class definition).
    self._proteins = RegionList()
    self.proteins = proteins
def getTranscriptAnnot(in_annot, gene_by_tr):
    """
    Get genomic model (genes, transcripts and exons) for the selected transcripts.

    Transcript identifiers are compared without their version suffix.

    :param in_annot: Path to the genomic annotations (format: GFF3).
    :type in_annot: str
    :param gene_by_tr: Gene by selected transcripts.
    :type gene_by_tr: dict
    :return: The list of selected transcripts.
    :rtype: anacore.region.RegionList
    :raises Exception: If at least one selected transcript has no mRNA record in the annotations.
    """
    tr_by_id = dict()
    with GFF3IO(in_annot) as FH_annot:
        for record in FH_annot:
            if record.type not in ("mRNA", "exon") or "transcript_id" not in record.annot:
                continue
            tr_id = record.annot["transcript_id"].split(".")[0]  # Remove transcript version
            if tr_id not in gene_by_tr:  # Transcript is not in panel
                continue
            if record.type == "mRNA":
                if tr_id not in tr_by_id:
                    tr_by_id[tr_id] = Transcript(
                        record.start, record.end, record.strand, record.seq_id,
                        tr_id, {}, gene_by_tr[tr_id])
            else:
                # Store the exon (the mRNA record of the transcript must have
                # been read before its exons)
                tr_by_id[tr_id].addChild(
                    Exon(record.start, record.end, record.strand, record.seq_id))
    if len(gene_by_tr) != len(tr_by_id):
        # The message uses the path received in parameter: the function must
        # not depend on a global variable of the calling script.
        raise Exception(
            "The following transcripts are missing in {}: {}".format(
                in_annot,
                set(gene_by_tr.keys()).difference(set(tr_by_id.keys()))))
    return RegionList(tr_by_id.values())
def getCDSFromTranscript(self):
    """
    Return CDS of the protein from the transcript and his exons.

    This function is used when CDS are not defined in the protein but exons and protein start and end are defined.
    An empty list is returned when the transcript has no exon or when no exon reaches the protein interval.

    :return: The list of CDS of the protein in protein strand order.
    :rtype: region.Regionlist
    :raises Exception: If the protein has no transcript or if its start or end is missing.
    """
    # Check information completion
    if self.transcript is None:
        raise Exception(
            "A link with a transcript is required to return CDS for {}.".
            format(self))
    if self.start is None or self.end is None:
        raise Exception(
            "Start and end for {} are required to return CDS from transcript {}."
            .format(self, self.transcript))
    # Exons to CDS
    exons = sorted(self.transcript.children, key=lambda x: (x.start, x.end))
    cds = RegionList()
    in_protein = False
    for curr_exon in exons:
        if not in_protein:
            if self.start > curr_exon.end:
                continue  # Exon located before the protein
            in_protein = True
        if self.end < curr_exon.start:
            break  # Exon located after the protein: next ones are too
        # The CDS is the exon part included in the protein interval
        cds.append(
            CDS(
                max(self.start, curr_exon.start),
                min(self.end, curr_exon.end),
                self.strand,
                self.reference
            )
        )
    # Sort by strand order
    if self.strand == "-":
        cds = RegionList(
            sorted(cds, key=lambda x: (x.end, x.start), reverse=True))
    # Return
    return cds
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Return True if the two breakends can be a readthrough.

    The fusion is reported as a possible readthrough when: the two breakends
    are on the same chromosome and the same strand, the interval covering
    them is not longer than rt_max_dist, each breakend is annotated by at
    least one gene on this strand that does not annotate the other breakend,
    and no other gene of the interval on this strand is separated from both
    the start gene and the end gene.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    is_readthrough = False
    if up.chrom == down.chrom:
        up_strand = getStrand(up, True)
        down_strand = getStrand(down, False)
        if (up_strand == "+" and down_strand == "+") or (up_strand == "-" and down_strand == "-"):  # Readthrough are +/+ or -/-
            # Order the breakends by position on the chromosome
            first = up
            second = down
            if first.pos > second.pos:
                first = down
                second = up
            # Interval covering the two breakends
            first_start, first_end = getBNDInterval(first)
            second_start, second_end = getBNDInterval(second)
            interval_start = min(first_start, second_start)
            interval_end = max(first_end, second_end) + 1
            if interval_end - interval_start <= rt_max_dist:
                # Genes annotating each breakend
                first_bp_gene = {annCmpName(annot) for annot in first.info[annotation_field]}
                second_bp_gene = {annCmpName(annot) for annot in second.info[annotation_field]}
                full_overlapping_gene = first_bp_gene & second_bp_gene
                only_first_bp_gene = first_bp_gene - second_bp_gene
                only_second_bp_gene = second_bp_gene - first_bp_gene
                if len(only_first_bp_gene) != 0 and len(only_second_bp_gene) != 0:
                    # Keep only the genes on the strand of the fusion
                    strand_by_gene = {annCmpName(annot): annot["STRAND"] for annot in first.info[annotation_field] + second.info[annotation_field]}
                    only_first_bp_gene = {gene for gene in only_first_bp_gene if strand_by_gene[gene] == up_strand}
                    only_second_bp_gene = {gene for gene in only_second_bp_gene if strand_by_gene[gene] == up_strand}
                    possible_on_strand = len(only_first_bp_gene) != 0 and len(only_second_bp_gene) != 0
                    if possible_on_strand:
                        # Genes of the interval on the strand of the fusion,
                        # without the genes annotating the two breakends
                        interval_region = Region(interval_start, interval_end, up_strand, first.chrom)
                        overlapped_genes = genes.getChr(first.chrom).getOverlapped(interval_region)
                        overlapped_genes = RegionList([gene for gene in overlapped_genes if regCmpName(gene) not in full_overlapping_gene and gene.strand == up_strand])
                        overlapped_genes_by_id = {regCmpName(gene): gene for gene in overlapped_genes}
                        contradict_readthrough = False
                        # NOTE(review): the lookups below raise a KeyError if
                        # a gene of the annotations is not found by name in
                        # the genes of the interval - confirm that the two
                        # naming callables always agree.
                        for start_gene_id in only_first_bp_gene:
                            start_gene = overlapped_genes_by_id[start_gene_id]
                            for end_gene_id in only_second_bp_gene:
                                end_gene = overlapped_genes_by_id[end_gene_id]
                                for interval_gene in overlapped_genes:
                                    if regCmpName(interval_gene) != regCmpName(start_gene) and \
                                       regCmpName(interval_gene) != regCmpName(end_gene):
                                        # Another gene overlapping neither the
                                        # start gene nor the end gene
                                        if not interval_gene.hasOverlap(start_gene) and not interval_gene.hasOverlap(end_gene):
                                            contradict_readthrough = True
                        is_readthrough = not contradict_readthrough
    return is_readthrough
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return by chromosome the region of the first breakend in each fusion.

    The annotation of regions contains the two breakends (tags: first and
    second). A record with several alternatives is split in one record per
    mate and the index of the alternative is added as suffix of its ID.

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).
    :rtype: dict
    :raises Exception: If none of the two breakends of a fusion has the tag RNA_FIRST.
    """
    caller_fusions = dict()
    processed_fusions = set()
    fusion_by_name = {}
    for record in bnd_by_id.values():
        for alt_idx, _ in enumerate(record.alt):
            # First breakend
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:  # Record must be splitted for each mate
                first_new_id += "_" + str(alt_idx)
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            # Second breakend
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:  # Record must be splitted for each mate
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                # The index is an int: it must be converted before the
                # concatenation with the ID.
                second_new_id += "_" + str(first_idx)
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            # The fusion ID is built from the IDs before their renaming
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id]))
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id in processed_fusions:
                continue  # Already seen from the mate
            processed_fusions.add(fusion_id)
            if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                raise Exception(
                    "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                    .format(alt_first_bnd.id, mate_id))
            if "RNA_FIRST" in alt_second_bnd.info:
                alt_first_bnd, alt_second_bnd = alt_second_bnd, alt_first_bnd
            interval_first_bnd = getBNDInterval(alt_first_bnd)
            fusion_name = " @@ ".join(
                sorted([alt_first_bnd.getName(), alt_second_bnd.getName()]))
            if fusion_name not in fusion_by_name:
                region_first_bnd = Region(
                    interval_first_bnd[0],
                    interval_first_bnd[1],
                    reference=alt_first_bnd.chrom,
                    annot={
                        "first": alt_first_bnd,
                        "second": alt_second_bnd
                    })
                if alt_first_bnd.chrom not in caller_fusions:
                    caller_fusions[alt_first_bnd.chrom] = RegionList()
                caller_fusions[alt_first_bnd.chrom].append(region_first_bnd)
                fusion_by_name[fusion_name] = region_first_bnd
            else:  # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                known_fusion = fusion_by_name[fusion_name]
                known_fusion.annot["first"].info[annotation_field] += alt_first_bnd.info[annotation_field]
                known_fusion.annot["second"].info[annotation_field] += alt_second_bnd.info[annotation_field]
    return caller_fusions
def getMergedRecords(inputs_variants, calling_sources, annotation_field, shared_filters):
    """
    Merge VCFRecords coming from several variant callers.

    Callers are processed in the order of inputs_variants. A fusion found for
    the first time is stored with the tags SRC, REFSRC and IDSRC and with the
    supports SR, PR, SRSRC and PRSRC by sample. A fusion already found by a
    previous caller is updated with the data of the current caller.

    :param inputs_variants: Pathes to the variants files.
    :type inputs_variants: list
    :param calling_sources: Names of the variants callers (in same order as inputs_variants).
    :type calling_sources: list
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param shared_filters: Filters tags applying to the variant and independent of caller like filters on annotations. These filters are not renamed to add caller ID as suffix.
    :type shared_filters: set
    :return: Merged VCF records. Each element is a pair (first breakend, second breakend).
    :rtype: list
    """
    whole_fusions = {}  # first bnd region by chromosome
    for idx_in, curr_in in enumerate(inputs_variants):
        curr_caller = calling_sources[idx_in]
        # NOTE(review): log is not a parameter: it must exist at module level
        log.info("Process {}".format(curr_caller))
        # breakend by id
        bnd_by_id = loadBNDByID(curr_in)
        # Group by fusion
        curr_caller_fusions = groupBNDByFusions(bnd_by_id, annotation_field)
        # Merge to other callers
        new_fusions = []
        for chrom, query, overlapped in iterOverlappedByRegion(
                curr_caller_fusions, whole_fusions):
            records = (query.annot["first"], query.annot["second"])
            # Extract PR and SR (before the renaming of the fields)
            support_by_spl = {}
            for spl, data in records[0].samples.items():
                support_by_spl[spl] = {
                    "PR": getCount(data, "PR"),
                    "SR": getCount(data, "SR")
                }
            # Get identical fusion from previous callers
            prev_records = getPrevFusion(records, overlapped, curr_caller)
            # Rename fields
            for curr_record in records:
                renameFields(curr_record, "s{}".format(idx_in), shared_filters)
            # Add to storage
            if prev_records is None:  # Prepare new fusion
                new_fusions.append(query)
                for curr_record in records:
                    # Data source
                    curr_record.info["SRC"] = [curr_caller]
                    curr_record.info["REFSRC"] = curr_caller
                    curr_record.info["IDSRC"] = [curr_record.id]
                    # CIPOS
                    if "s{}_CIPOS".format(idx_in) in curr_record.info:
                        curr_record.info["CIPOS"] = curr_record.info[
                            "s{}_CIPOS".format(idx_in)]
                    # Quality
                    if idx_in != 0:
                        curr_record.qual = None  # For consistency, the quality of the variant comes only from the first caller of the variant
                    # SR and PR by sample (from the first caller finding the variant: callers are in user order)
                    # Successive insertions at index 0: the format starts with PR, SR, PRSRC, SRSRC
                    curr_record.format.insert(0, "SRSRC")
                    curr_record.format.insert(0, "PRSRC")
                    curr_record.format.insert(0, "SR")
                    curr_record.format.insert(0, "PR")
                    for spl_name, spl_data in curr_record.samples.items():
                        spl_data["SR"] = support_by_spl[spl_name]["SR"]
                        spl_data["PR"] = support_by_spl[spl_name]["PR"]
                        spl_data["SRSRC"] = [support_by_spl[spl_name]["SR"]]
                        spl_data["PRSRC"] = [support_by_spl[spl_name]["PR"]]
            else:  # Update previous fusion
                for prev_rec, curr_rec in zip(prev_records, records):
                    prev_rec.info["SRC"].append(curr_caller)
                    prev_rec.info["IDSRC"].append(curr_rec.id)
                    # FILTERS
                    new_filters = set(curr_rec.filter) - {
                        "Imprecise"
                    }  # Imprecise is taken into account only for the first caller to keep consistency with CIPOS
                    # NOTE(review): with "or" the previous filters are kept
                    # when there are any and new_filters is used only when
                    # the previous set is empty. If the union of the two sets
                    # was intended the operator should be "|" - confirm.
                    prev_rec.filter = list(set(prev_rec.filter) or new_filters)
                    # FORMAT
                    prev_rec.format.extend(curr_rec.format)
                    # INFO
                    del (curr_rec.info["MATEID"])
                    if "IMPRECISE" in curr_rec.info:
                        del (
                            curr_rec.info["IMPRECISE"]
                        )  # Imprecise is taken into account only for the first caller to keep consistency with CIPOS
                    prev_rec.info.update(curr_rec.info)
                    # SAMPLES
                    for spl_name, spl_data in prev_rec.samples.items():
                        spl_data.update(curr_rec.samples[spl_name])
                        spl_data["SRSRC"].append(
                            support_by_spl[spl_name]["SR"])
                        spl_data["PRSRC"].append(
                            support_by_spl[spl_name]["PR"])
        # Add new fusions in whole_fusions (after the loop: the storage is
        # read by the iteration on the overlaps)
        for curr in new_fusions:
            if curr.reference.name not in whole_fusions:
                whole_fusions[curr.reference.name] = RegionList()
            whole_fusions[curr.reference.name].append(curr)
        # Sort fusions by first breakend
        for chrom, fusions in whole_fusions.items():
            whole_fusions[chrom] = RegionList(
                sorted(fusions, key=lambda x: (x.start, x.end)))
    # Flatten fusions
    returned_fusions = []
    # NOTE(review): the loop variable chr is unused and hides the builtin chr
    for chr, fusions in whole_fusions.items():
        for fusion_region in fusions:
            returned_fusions.append(
                (fusion_region.annot["first"], fusion_region.annot["second"]))
    return returned_fusions
def testIterOverlapped_3(self):
    """
    Case where a subject is included in another.

    The subject sbjct_3 (16-18) is included in sbjct_2 (14-20). Queries of
    several lengths are moved along the subjects and the overlapped subjects
    returned by iterOverlapped are compared with the expected ones, query by
    query and then for all the queries in one call.
    """
    # Init test data
    sbjct_1 = Region(7, 10, "+", "chr1", "sbjct_1")
    sbjct_2 = Region(14, 20, "+", "chr1", "sbjct_2")
    sbjct_3 = Region(16, 18, "+", "chr1", "sbjct_3")
    sbjct_4 = Region(24, 29, "+", "chr1", "sbjct_4")
    subjects = RegionList([sbjct_1, sbjct_2, sbjct_3, sbjct_4])
    # Each query is stored with the subjects it must overlap
    queries_info = [
        {"query": Region(11, 11, "+", "chr1", "query_l1_01"), "overlapped": []},
        {"query": Region(12, 12, "+", "chr1", "query_l1_02"), "overlapped": []},
        {"query": Region(13, 13, "+", "chr1", "query_l1_03"), "overlapped": []},
        {"query": Region(14, 14, "+", "chr1", "query_l1_04"), "overlapped": [sbjct_2]},
        {"query": Region(15, 15, "+", "chr1", "query_l1_05"), "overlapped": [sbjct_2]},
        {"query": Region(16, 16, "+", "chr1", "query_l1_06"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(17, 17, "+", "chr1", "query_l1_07"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(18, 18, "+", "chr1", "query_l1_08"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(19, 19, "+", "chr1", "query_l1_09"), "overlapped": [sbjct_2]},
        {"query": Region(20, 20, "+", "chr1", "query_l1_10"), "overlapped": [sbjct_2]},
        {"query": Region(21, 21, "+", "chr1", "query_l1_11"), "overlapped": []},
        {"query": Region(22, 22, "+", "chr1", "query_l1_12"), "overlapped": []},
        {"query": Region(11, 13, "+", "chr1", "query_l3_01"), "overlapped": []},
        {"query": Region(12, 14, "+", "chr1", "query_l3_02"), "overlapped": [sbjct_2]},
        {"query": Region(13, 15, "+", "chr1", "query_l3_03"), "overlapped": [sbjct_2]},
        {"query": Region(14, 16, "+", "chr1", "query_l3_04"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(15, 17, "+", "chr1", "query_l3_05"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(16, 18, "+", "chr1", "query_l3_06"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(17, 19, "+", "chr1", "query_l3_07"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(18, 20, "+", "chr1", "query_l3_08"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(19, 21, "+", "chr1", "query_l3_09"), "overlapped": [sbjct_2]},
        {"query": Region(20, 22, "+", "chr1", "query_l3_10"), "overlapped": [sbjct_2]},
        {"query": Region(21, 23, "+", "chr1", "query_l3_11"), "overlapped": []},
        {"query": Region(13, 17, "+", "chr1", "query_l5_01"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(15, 19, "+", "chr1", "query_l5_02"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(17, 21, "+", "chr1", "query_l5_03"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(18, 22, "+", "chr1", "query_l5_04"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(19, 23, "+", "chr1", "query_l5_05"), "overlapped": [sbjct_2]},
        {"query": Region(20, 24, "+", "chr1", "query_l5_06"), "overlapped": [sbjct_2, sbjct_4]},
        {"query": Region(13, 18, "+", "chr1", "query_l6_01"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(14, 19, "+", "chr1", "query_l6_02"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(15, 20, "+", "chr1", "query_l6_03"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(16, 21, "+", "chr1", "query_l6_04"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(17, 22, "+", "chr1", "query_l6_05"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(18, 23, "+", "chr1", "query_l6_06"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(19, 24, "+", "chr1", "query_l6_07"), "overlapped": [sbjct_2, sbjct_4]},
        {"query": Region(13, 19, "+", "chr1", "query_l7_01"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(14, 20, "+", "chr1", "query_l7_02"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(15, 21, "+", "chr1", "query_l7_03"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(16, 22, "+", "chr1", "query_l7_04"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(17, 23, "+", "chr1", "query_l7_05"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(18, 24, "+", "chr1", "query_l7_06"), "overlapped": [sbjct_2, sbjct_3, sbjct_4]},
        {"query": Region(19, 24, "+", "chr1", "query_l7_07"), "overlapped": [sbjct_2, sbjct_4]},
        {"query": Region(13, 20, "+", "chr1", "query_l8_01"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(14, 21, "+", "chr1", "query_l8_02"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(15, 22, "+", "chr1", "query_l8_03"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(13, 21, "+", "chr1", "query_l9_01"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(14, 22, "+", "chr1", "query_l9_02"), "overlapped": [sbjct_2, sbjct_3]},
        {"query": Region(13, 22, "+", "chr1", "query_l10_01"), "overlapped": [sbjct_2, sbjct_3]}
    ]
    # Queries are given to iterOverlapped sorted by start then end
    queries_info = sorted(queries_info, key=lambda x: (x["query"].start, x["query"].end))
    # Independent evaluation: one call by query
    for curr_eval in queries_info:
        obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped([curr_eval["query"]], subjects)]
        self.assertEqual(obs_overlapped, [curr_eval["overlapped"]])
    # Grouped evaluation: all the queries in one call
    queries = [curr_info["query"] for curr_info in queries_info]
    expec_overlapped = [curr_info["overlapped"] for curr_info in queries_info]
    obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped(queries, subjects)]
    self.assertEqual(obs_overlapped, expec_overlapped)
    # Grouped evaluation and inclusion between subjects starts the list of subjects
    # (sbjct_1 is removed: it is overlapped by no query, the expected values are unchanged)
    shifted_subjects = subjects[1:]
    queries = [curr_info["query"] for curr_info in queries_info]
    expec_overlapped = [curr_info["overlapped"] for curr_info in queries_info]
    obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped(queries, shifted_subjects)]
    self.assertEqual(obs_overlapped, expec_overlapped)
    # Grouped evaluation and inclusion between subjects ends the list of subjects
    # (sbjct_4 is removed from the subjects and from the expected values)
    poped_subjects = subjects[:-1]
    queries = [curr_info["query"] for curr_info in queries_info]
    expec_overlapped = []
    for curr_info in queries_info:
        expec_overlapped.append([elt for elt in curr_info["overlapped"] if elt != sbjct_4])
    obs_overlapped = [overlapped_subjects for query, overlapped_subjects in iterOverlapped(queries, poped_subjects)]
    self.assertEqual(obs_overlapped, expec_overlapped)