コード例 #1
0
	def remove_duplicates(self,args):
		"""
		Merge duplicate fusions within this dataset and drop fusions that are
		not annotated with genes on both sides.

		Fusions are compared pairwise within each list stored under
		self.index[<left>][<right>]. Whenever the comparator reports a match,
		the returned (merged) fusion replaces the first fusion and the second
		one is discarded. A fusion that absorbed another one is re-examined in
		a next pass, because the merged entry may now match fusions it did not
		match before. Finally the dataset is flushed and re-filled with the
		surviving fusions.

		:param args: object providing `matching_method` ("overlap", "subset"
			or "egm"); it is also handed to CompareFusionsBySpanningGenes.
		:return: the number of fusions left in the dataset (len(self)).
		:raises Exception: if either self.genes_spanning_left_junction or
			self.genes_spanning_right_junction is falsy, or if
			args.matching_method is not one of the supported methods.
		"""
		if(not self.genes_spanning_left_junction or not self.genes_spanning_right_junction):
			raise Exception("Gene annotations on dataset '"+self.name+"' were not found")

		old_count = len(self)
		# NOTE(review): names containing "vs." presumably denote merged
		# comparison datasets, for which the report is skipped - confirm.
		if("vs." not in self.name):
			self.logger.info("Duplication removal: "+self.name+" ("+str(old_count)+" fusions)")

		if(args.matching_method not in ("overlap","subset","egm")):
			raise Exception("Unknown overlap method for removing duplicates: '"+args.matching_method+"' for dataset "+self.name)

		# Imported locally, as in the original code, so the dependency is only
		# needed once a valid matching method has been requested.
		from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
		overlap = CompareFusionsBySpanningGenes(False,False,args)

		stats_non_gene_spanning = 0
		fusions_to_add = []

		for fusions_by_right_key in self.index.values():
			for all_fusions in fusions_by_right_key.values():
				n = len(all_fusions)

				# Slots of removed or absorbed fusions are overwritten with
				# False, so indices stay valid during all passes.
				queue = range(n)
				while(len(queue) > 0):
					merged_indices = []
					for i in queue:
						fusion_1 = all_fusions[i]
						if(not fusion_1):
							continue

						if(len(fusion_1.get_annotated_genes_left()) == 0 or len(fusion_1.get_annotated_genes_right()) == 0):
							stats_non_gene_spanning += 1
							all_fusions[i] = False
							continue

						is_duplicate = False
						for j in range(i+1,n):
							fusion_2 = all_fusions[j]
							if(not fusion_2):
								continue

							match = overlap.match_fusions(fusion_1,fusion_2,False)
							if(match):
								# Continue comparing with the merged entry
								fusion_1 = match
								all_fusions[i] = match
								all_fusions[j] = False
								is_duplicate = True

						# Only entries that changed need another pass
						if(is_duplicate):
							merged_indices.append(i)
					queue = merged_indices

				fusions_to_add.extend(fusion for fusion in all_fusions if fusion)

		self.flush()
		for fusion in fusions_to_add:
			self.add_fusion(fusion)

		if("vs." not in self.name):
			self.logger.info("* Full: "+str(old_count))
			self.logger.info("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
			self.logger.info("* Unique: "+str(len(self)))

		return len(self)
コード例 #2
0
    def remove_duplicates(self, args):
        """
        Merge duplicate fusions within this dataset and drop fusions that are
        not annotated with genes on both sides.

        Fusions are compared pairwise within each list stored under
        self.index[<left>][<right>]. Whenever the comparator reports a match,
        the first fusion is updated in place (union of both `matches`, plus
        direction, strands and gene annotations taken from the comparator's
        result) and the second one is discarded. A fusion that absorbed
        another one is re-examined in a next pass, because the merged entry
        may now match fusions it did not match before. Finally the dataset is
        flushed and re-filled with the surviving fusions.

        :param args: object providing `matching_method` ("overlap", "subset"
            or "egm"); it is also handed to CompareFusionsBySpanningGenes.
        :return: the number of fusions left in the dataset (len(self)).
        :raises Exception: if either self.genes_spanning_left_junction or
            self.genes_spanning_right_junction is falsy, or if
            args.matching_method is not one of the supported methods.
        """
        if (not self.genes_spanning_left_junction
                or not self.genes_spanning_right_junction):
            raise Exception("Gene annotations on dataset '" + self.name +
                            "' were not found")

        old_count = len(self)
        # NOTE(review): names containing "vs." presumably denote merged
        # comparison datasets, for which the report is skipped - confirm.
        if "vs." not in self.name:
            self.logger.info("Duplication removal: " + self.name + " (" +
                             str(old_count) + " fusions)")

        if args.matching_method not in ("overlap", "subset", "egm"):
            raise Exception(
                "Unknown overlap method for removing duplicates: '" +
                args.matching_method + "' for dataset " + self.name)

        # Imported locally, as in the original code, so the dependency is only
        # needed once a valid matching method has been requested.
        from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
        overlap = CompareFusionsBySpanningGenes(False, False, args)

        stats_non_gene_spanning = 0
        fusions_to_add = []

        for fusions_by_right_key in self.index.values():
            for all_fusions in fusions_by_right_key.values():
                n = len(all_fusions)

                # Slots of removed or absorbed fusions are overwritten with
                # False, so indices stay valid during all passes.
                queue = range(n)
                while len(queue) > 0:
                    merged_indices = []
                    for i in queue:
                        fusion_1 = all_fusions[i]
                        if not fusion_1:
                            continue

                        if (len(fusion_1.get_annotated_genes_left(False)) == 0
                                or len(fusion_1.get_annotated_genes_right(
                                    False)) == 0):
                            stats_non_gene_spanning += 1
                            all_fusions[i] = False
                            continue

                        is_duplicate = False
                        for j in range(i + 1, n):
                            fusion_2 = all_fusions[j]
                            if not fusion_2:
                                continue

                            match = overlap.match_fusions(
                                fusion_1, fusion_2, False)
                            if not match:
                                continue

                            # Fold fusion_2 into fusion_1: keep the matches of
                            # both and take over the properties computed by
                            # the comparator for the merged entry.
                            fusion_1.matches = fusion_1.matches | fusion_2.matches
                            fusion_1.acceptor_donor_direction = match.acceptor_donor_direction
                            fusion_1.left_strand = match.left_strand
                            fusion_1.right_strand = match.right_strand
                            fusion_1.annotated_genes_left = match.annotated_genes_left
                            fusion_1.annotated_genes_right = match.annotated_genes_right

                            all_fusions[i] = fusion_1
                            all_fusions[j] = False
                            is_duplicate = True

                            # The temporary merge result is no longer needed
                            match.prepare_deletion()
                            del match

                        # Only entries that changed need another pass
                        if is_duplicate:
                            merged_indices.append(i)
                    queue = merged_indices

                fusions_to_add.extend(
                    fusion for fusion in all_fusions if fusion)

        self.flush()
        for fusion in fusions_to_add:
            self.add_fusion(fusion)

        if "vs." not in self.name:
            self.logger.debug("* Full: " + str(old_count))
            self.logger.debug("* Gene-spanning: " +
                              str(old_count - stats_non_gene_spanning))
            self.logger.debug("* Unique: " + str(len(self)))

        return len(self)