def remove_duplicates(self, args):
    """Merge fusions in this dataset that match each other.

    Every list of fusions stored in the two-level ``self.index`` mapping
    is scanned pairwise.  Fusions without annotated genes on either side
    are discarded; when ``match_fusions`` returns a truthy result for a
    pair, that result replaces the first fusion and the second one is
    dropped.  Afterwards the dataset is flushed and repopulated with the
    surviving fusions.

    :param args: object exposing ``matching_method`` (one of
        ``"overlap"``, ``"subset"`` or ``"egm"``); it is also forwarded
        to ``CompareFusionsBySpanningGenes``.
    :returns: ``len(self)`` after the surviving fusions were re-added.
    :raises Exception: if the dataset has no gene annotations or if
        ``args.matching_method`` is not a supported method.
    """
    if not self.genes_spanning_left_junction or not self.genes_spanning_right_junction:
        raise Exception("Gene annotations on dataset '"+self.name+"' were not found")

    old_count = len(self)
    # Datasets whose name contains "vs." are not reported on.
    report = "vs." not in self.name
    if report:
        self.logger.info("Duplication removal: "+self.name+" ("+str(old_count)+" fusions)")

    if args.matching_method not in ("overlap", "subset", "egm"):
        raise Exception("Unknown overlap method for removing duplicates: '"+args.matching_method+"' for dataset "+self.name)

    # Imported here, exactly as the original code did, rather than at
    # module level.
    from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
    overlap = CompareFusionsBySpanningGenes(False, False, args)

    stats_non_gene_spanning = 0
    fusions_to_add = []

    for _, fusions_by_right in self.index.items():
        for _, all_fusions in fusions_by_right.items():
            n = len(all_fusions)
            queue = range(n)

            # A slot that absorbed a duplicate is queued again, so the
            # merged entry is re-compared against the later slots until
            # a pass produces no further merges.
            while len(queue) > 0:
                duplicates = []
                for i in queue:
                    fusion_1 = all_fusions[i]
                    if not fusion_1:
                        # Slot was already cleared (set to False).
                        continue

                    if (len(fusion_1.get_annotated_genes_left()) == 0
                            or len(fusion_1.get_annotated_genes_right()) == 0):
                        stats_non_gene_spanning += 1
                        all_fusions[i] = False
                        continue

                    is_duplicate = False
                    for j in range(i + 1, n):
                        fusion_2 = all_fusions[j]
                        if not fusion_2:
                            continue

                        match = overlap.match_fusions(fusion_1, fusion_2, False)
                        if match:
                            # Keep comparing with the merged result.
                            fusion_1 = match
                            all_fusions[i] = match
                            all_fusions[j] = False
                            is_duplicate = True

                    if is_duplicate:
                        duplicates.append(i)
                queue = duplicates

            fusions_to_add.extend(fusion for fusion in all_fusions if fusion)

    # Rebuild the dataset from the surviving (possibly merged) fusions.
    self.flush()
    for fusion in fusions_to_add:
        self.add_fusion(fusion)

    if report:
        self.logger.info("* Full: "+str(old_count))
        self.logger.info("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
        self.logger.info("* Unique: "+str(len(self)))

    return len(self)
def remove_duplicates(self, args):
    """Merge fusions in this dataset that match each other.

    Every list of fusions stored in the two-level ``self.index`` mapping
    is scanned pairwise.  Fusions without annotated genes on either side
    are discarded; when ``match_fusions`` returns a truthy result for a
    pair, the first fusion is updated in place (union of both ``matches``
    plus the direction, strands and annotated genes of the result) and
    the second one is dropped.  Afterwards the dataset is flushed and
    repopulated with the surviving fusions.

    :param args: object exposing ``matching_method`` (one of
        ``"overlap"``, ``"subset"`` or ``"egm"``); it is also forwarded
        to ``CompareFusionsBySpanningGenes``.
    :returns: ``len(self)`` after the surviving fusions were re-added.
    :raises Exception: if the dataset has no gene annotations or if
        ``args.matching_method`` is not a supported method.
    """
    if not self.genes_spanning_left_junction or not self.genes_spanning_right_junction:
        raise Exception("Gene annotations on dataset '" + self.name + "' were not found")

    old_count = len(self)
    # Datasets whose name contains "vs." are not reported on.
    report = "vs." not in self.name
    if report:
        self.logger.info("Duplication removal: " + self.name + " (" + str(old_count) + " fusions)")

    if args.matching_method not in ("overlap", "subset", "egm"):
        raise Exception(
            "Unknown overlap method for removing duplicates: '" + args.matching_method + "' for dataset " + self.name)

    # Imported here, exactly as the original code did, rather than at
    # module level.
    from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
    overlap = CompareFusionsBySpanningGenes(False, False, args)

    stats_non_gene_spanning = 0
    fusions_to_add = []

    for _, fusions_by_right in self.index.items():
        for _, all_fusions in fusions_by_right.items():
            n = len(all_fusions)
            queue = range(n)

            # A slot that absorbed a duplicate is queued again, so the
            # updated entry is re-compared against the later slots until
            # a pass produces no further merges.
            while len(queue) > 0:
                duplicates = []
                for i in queue:
                    fusion_1 = all_fusions[i]
                    if not fusion_1:
                        # Slot was already cleared (set to False).
                        continue

                    if (len(fusion_1.get_annotated_genes_left(False)) == 0
                            or len(fusion_1.get_annotated_genes_right(False)) == 0):
                        stats_non_gene_spanning += 1
                        all_fusions[i] = False
                        continue

                    is_duplicate = False
                    for j in range(i + 1, n):
                        fusion_2 = all_fusions[j]
                        if not fusion_2:
                            continue

                        match = overlap.match_fusions(fusion_1, fusion_2, False)
                        if match:
                            # fusion_1 stays in slot i and is updated in
                            # place; the match object only supplies the
                            # merged attributes.
                            fusion_1.matches = fusion_1.matches | fusion_2.matches
                            fusion_1.acceptor_donor_direction = match.acceptor_donor_direction
                            fusion_1.left_strand = match.left_strand
                            fusion_1.right_strand = match.right_strand
                            fusion_1.annotated_genes_left = match.annotated_genes_left
                            fusion_1.annotated_genes_right = match.annotated_genes_right

                            all_fusions[j] = False
                            is_duplicate = True

                            # The temporary match object is no longer
                            # needed once its attributes were copied.
                            match.prepare_deletion()
                            del match

                    if is_duplicate:
                        duplicates.append(i)
                queue = duplicates

            fusions_to_add.extend(fusion for fusion in all_fusions if fusion)

    # Rebuild the dataset from the surviving (possibly merged) fusions.
    self.flush()
    for fusion in fusions_to_add:
        self.add_fusion(fusion)

    if report:
        self.logger.debug("* Full: " + str(old_count))
        self.logger.debug("* Gene-spanning: " + str(old_count - stats_non_gene_spanning))
        self.logger.debug("* Unique: " + str(len(self)))

    return len(self)