コード例 #1
0
	def remove_duplicates(self,args):
		"""
		Merge fusions in this dataset that match each other.

		For every list of fusions stored in the two-level self.index,
		each fusion is compared with the fusions that follow it in the
		same list. When match_fusions() reports a match, the returned
		merged entry replaces the first fusion and the second one is
		dropped. Fusions without annotated genes on the left or on the
		right side are discarded. Afterwards the dataset is flushed and
		re-filled with the remaining entries.

		Parameters:
		- args: settings object; matching_method is read here (it must be
		  "overlap", "subset" or "egm") and the whole object is handed
		  to CompareFusionsBySpanningGenes.

		Returns the number of fusions left in the dataset.

		Raises Exception when the gene annotation flags of the dataset
		are not set, or when args.matching_method is not supported.
		"""
		if(not self.genes_spanning_left_junction or not self.genes_spanning_right_junction):
			raise Exception("Gene annotations on dataset '"+self.name+"' were not found")

		old_count = len(self)
		# Datasets with "vs." in their name are not reported in the log
		if("vs." not in self.name):
			self.logger.info("Duplication removal: "+self.name+" ("+str(old_count)+" fusions)")

		if(args.matching_method not in ["overlap","subset","egm"]):
			raise Exception("Unknown overlap method for removing duplicates: '"+args.matching_method+"' for dataset "+self.name)

		# Imported here, only after the matching method has been validated
		from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
		overlap = CompareFusionsBySpanningGenes(False,False,args)

		stats_non_gene_spanning = 0
		fusions_to_add = []

		for _,right_index in self.index.items():
			for _,all_fusions in right_index.items():
				n = len(all_fusions)

				# Positions that still have to be checked. A position whose
				# fusion got merged is checked again in the next round,
				# now using the merged entry.
				queue = range(n)
				while(queue):
					merged_positions = []
					for i in queue:
						fusion_1 = all_fusions[i]
						if(not fusion_1):
							# Slot was already dropped (set to False)
							continue

						if(len(fusion_1.get_annotated_genes_left()) == 0 or len(fusion_1.get_annotated_genes_right()) == 0):
							stats_non_gene_spanning += 1
							all_fusions[i] = False
							continue

						is_duplicate = False
						for j in range(i+1,n):
							fusion_2 = all_fusions[j]
							if(not fusion_2):
								continue

							match = overlap.match_fusions(fusion_1,fusion_2,False)
							if(match):
								# Continue comparing with the merged entry
								fusion_1 = match
								all_fusions[i] = match
								all_fusions[j] = False
								is_duplicate = True

						if(is_duplicate):
							merged_positions.append(i)
					queue = merged_positions

				fusions_to_add.extend([fusion for fusion in all_fusions if fusion])

		self.flush()
		for fusion in fusions_to_add:
			self.add_fusion(fusion)

		if("vs." not in self.name):
			self.logger.info("* Full: "+str(old_count))
			self.logger.info("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
			self.logger.info("* Unique: "+str(len(self)))

		return len(self)
コード例 #2
0
ファイル: OverlapComplex.py プロジェクト: xflicsu/fuma
    def overlay_fusions(self, sparse=True, export_dir=False, args=None):
        """
        Compare all dataset combinations and record their overlap.

        The datasets are stored in self.matrix_tmp under their 1-based
        position ("1", "2", ...). For every combination produced by
        find_combination_table(), the two inputs named by create_keys()
        are compared with CompareFusionsBySpanningGenes; the merged
        result is stored in self.matrix_tmp under the third key and its
        size is recorded in self.matches_total.

        Parameters:
        - sparse: when False (and export_dir is set and the format is
          'extensive') every comparison result is written to a
          CG-junctions file. The original note stated that sparse
          should only be True for the 'summary' output format.
        - export_dir: False to disable exporting. For the 'list' format
          it is written to directly (it needs a write() method); for
          the 'extensive' format it is used as a directory path prefix.
        - args: settings object; format and long_gene_size are read
          here and the object is passed on to the comparison and export
          calls. Required in practice, although it defaults to None.

        Returns the result of the last comparison, or None when the
        combination table did not yield a single comparison.
        """
        n = len(self.datasets)

        self.logger.info("Determining the overlap of fusion genes in " +
                         str(n) + " datasets")

        self.matrix_tmp = {}
        for i, dataset in enumerate(self.datasets):
            self.matrix_tmp[str(i + 1)] = dataset

        is_list_format = (args.format == "list")

        if is_list_format and export_dir != False:
            if args.long_gene_size > 0:
                large_genes = ("Spans large gene (>" +
                               str(args.long_gene_size) + "bp)")
            else:
                large_genes = "Spans large gene (feature disabled)"

            export_dir.write("Left-genes\tRight-genes\t" + large_genes + "\t" +
                             "\t".join(self.dataset_names) + "\n")

        # Bound up front: the loop below may not run at all (empty
        # combination table), which used to end in an UnboundLocalError.
        matches = None
        r_0 = None

        for ri, r in enumerate(self.find_combination_table(n)):
            r_0 = self.find_combination_table_r_i(n, ri, 0)
            matches_this_iteration = set()

            for c in r:
                keys = self.create_keys(c)

                comparison = CompareFusionsBySpanningGenes(
                    self.matrix_tmp[keys[0]], self.matrix_tmp[keys[1]], args)
                matches = comparison.find_overlap()
                matches_this_iteration = matches_this_iteration | matches[3]

                if not sparse and export_dir and args.format == "extensive":
                    matches[0].export_to_CG_Junctions_file(
                        export_dir + "/" + matches[0].name +
                        ".CG-junctions.txt")

                self.matrix_tmp[keys[2]] = matches[0]
                self.matches_total[keys[2]] = len(matches[0])

            # Write those that are not marked to go to the next iteration
            # NOTE(review): unlike the header and the final export, this
            # step does not check export_dir - confirm that is intended.
            if is_list_format:
                if len(r_0) > 2:
                    # Export the results of the previous round and release
                    # them afterwards
                    for previous in self.find_combination_table_r(n, ri - 1):
                        export_key = '.'.join([str(x) for x in previous])

                        self.matrix_tmp[export_key].export_to_list(
                            export_dir, self.dataset_names,
                            matches_this_iteration, args)
                        del self.matrix_tmp[export_key]
                else:
                    # Export the input datasets; they remain in matrix_tmp
                    for i in range(n):
                        self.matrix_tmp[str(i + 1)].export_to_list(
                            export_dir, self.dataset_names,
                            matches_this_iteration, args)

        # Finally export the entry built from the first combination of the
        # last round, without any matches to exclude
        if is_list_format and export_dir != False and r_0 is not None:
            export_key = '.'.join([str(x) for x in r_0])
            self.matrix_tmp[export_key].export_to_list(
                export_dir, self.dataset_names, set(), args)

        return matches
コード例 #3
0
ファイル: OverlapComplex.py プロジェクト: yhoogstrate/fuma
	def overlay_fusions(self,sparse=True,export_dir=False,args=None):
		"""
		Compare all dataset combinations and record their overlap.

		The datasets are stored in self.matrix_tmp under their 1-based
		position ("1", "2", ...). For every combination produced by
		find_combination_table(), the two inputs named by create_keys()
		are compared with CompareFusionsBySpanningGenes; the merged
		result is stored in self.matrix_tmp under the third key and its
		size is recorded in self.matches_total.

		Parameters:
		- sparse: when False (and export_dir is set and the format is
		  'extensive') every comparison result is written to a
		  CG-junctions file. The original note stated that sparse
		  should only be True for the 'summary' output format.
		- export_dir: False to disable exporting. For the 'list' format
		  it is written to directly (it needs a write() method); for
		  the 'extensive' format it is used as a directory path prefix.
		- args: settings object; format and long_gene_size are read
		  here and the object is passed on to the comparison and export
		  calls. Required in practice, although it defaults to None.

		Returns the result of the last comparison, or None when the
		combination table did not yield a single comparison.
		"""
		n = len(self.datasets)

		self.logger.info("Determining the overlap of fusion genes in "+str(n)+" datasets")

		self.matrix_tmp = {}
		for i,dataset in enumerate(self.datasets):
			self.matrix_tmp[str(i+1)] = dataset

		is_list_format = (args.format == "list")

		if(is_list_format and export_dir != False):
			if(args.long_gene_size > 0):
				large_genes = "Spans large gene (>"+str(args.long_gene_size)+"bp)"
			else:
				large_genes = "Spans large gene (feature disabled)"

			export_dir.write("Left-genes\tRight-genes\t"+large_genes+"\t"+"\t".join(self.dataset_names)+"\n")

		# Bound up front: the loop below may not run at all (empty
		# combination table), which used to end in an UnboundLocalError.
		matches = None
		r_0 = None

		for ri,r in enumerate(self.find_combination_table(n)):
			r_0 = self.find_combination_table_r_i(n,ri,0)
			matches_this_iteration = set()

			for c in r:
				keys = self.create_keys(c)

				comparison = CompareFusionsBySpanningGenes(self.matrix_tmp[keys[0]],self.matrix_tmp[keys[1]],args)
				matches = comparison.find_overlap()
				matches_this_iteration = matches_this_iteration | matches[3]

				if(not sparse and export_dir and args.format == "extensive"):
					matches[0].export_to_CG_Junctions_file(export_dir+"/"+matches[0].name+".CG-junctions.txt")

				self.matrix_tmp[keys[2]] = matches[0]
				self.matches_total[keys[2]] = len(matches[0])

			# Write those that are not marked to go to the next iteration
			# NOTE(review): unlike the header and the final export, this
			# step does not check export_dir - confirm that is intended.
			if(is_list_format):
				if(len(r_0) > 2):
					# Export the results of the previous round and release
					# them afterwards
					for previous in self.find_combination_table_r(n,ri-1):
						export_key = '.'.join([str(x) for x in previous])

						self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,matches_this_iteration,args)
						del self.matrix_tmp[export_key]
				else:
					# Export the input datasets; they remain in matrix_tmp
					for i in range(n):
						self.matrix_tmp[str(i+1)].export_to_list(export_dir,self.dataset_names,matches_this_iteration,args)

		# Finally export the entry built from the first combination of the
		# last round, without any matches to exclude
		if(is_list_format and export_dir != False and r_0 is not None):
			export_key = '.'.join([str(x) for x in r_0])
			self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,set(),args)

		return matches
コード例 #4
0
    def remove_duplicates(self, args):
        """
        Merge fusions in this dataset that match each other.

        For every list of fusions stored in the two-level self.index,
        each fusion is compared with the fusions that follow it in the
        same list. When match_fusions() reports a match, the first
        fusion takes over the union of both match sets plus the
        direction, strands and gene annotations of the merged result,
        and the second fusion is dropped. Fusions without annotated
        genes on the left or on the right side are discarded.
        Afterwards the dataset is flushed and re-filled with the
        remaining entries.

        Parameters:
        - args: settings object; matching_method is read here (it must
          be "overlap", "subset" or "egm") and the whole object is
          handed to CompareFusionsBySpanningGenes.

        Returns the number of fusions left in the dataset.

        Raises Exception when the gene annotation flags of the dataset
        are not set, or when args.matching_method is not supported.
        """
        if (not self.genes_spanning_left_junction
                or not self.genes_spanning_right_junction):
            raise Exception("Gene annotations on dataset '" + self.name +
                            "' were not found")

        old_count = len(self)
        # Datasets with "vs." in their name are not reported in the log
        if "vs." not in self.name:
            self.logger.info("Duplication removal: " + self.name + " (" +
                             str(old_count) + " fusions)")

        if args.matching_method not in ["overlap", "subset", "egm"]:
            raise Exception(
                "Unknown overlap method for removing duplicates: '" +
                args.matching_method + "' for dataset " + self.name)

        # Imported here, only after the matching method has been validated
        from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
        overlap = CompareFusionsBySpanningGenes(False, False, args)

        stats_non_gene_spanning = 0
        fusions_to_add = []

        for _, right_index in self.index.items():
            for _, all_fusions in right_index.items():
                n = len(all_fusions)

                # Positions that still have to be checked. A position whose
                # fusion got merged is checked again in the next round,
                # now using the updated annotations.
                queue = range(n)
                while queue:
                    merged_positions = []
                    for i in queue:
                        fusion_1 = all_fusions[i]
                        if not fusion_1:
                            # Slot was already dropped (set to False)
                            continue

                        genes_left = fusion_1.get_annotated_genes_left(False)
                        if (len(genes_left) == 0 or len(
                                fusion_1.get_annotated_genes_right(False))
                                == 0):
                            stats_non_gene_spanning += 1
                            all_fusions[i] = False
                            continue

                        is_duplicate = False
                        for j in range(i + 1, n):
                            fusion_2 = all_fusions[j]
                            if not fusion_2:
                                continue

                            match = overlap.match_fusions(
                                fusion_1, fusion_2, False)
                            if not match:
                                continue

                            # Fold the merged result into fusion_1 itself
                            # instead of keeping the new match object
                            fusion_1.matches = (fusion_1.matches
                                                | fusion_2.matches)
                            fusion_1.acceptor_donor_direction = match.acceptor_donor_direction
                            fusion_1.left_strand = match.left_strand
                            fusion_1.right_strand = match.right_strand
                            fusion_1.annotated_genes_left = match.annotated_genes_left
                            fusion_1.annotated_genes_right = match.annotated_genes_right

                            all_fusions[i] = fusion_1
                            all_fusions[j] = False
                            is_duplicate = True

                            # The temporary match object is no longer needed
                            match.prepare_deletion()
                            del match

                        if is_duplicate:
                            merged_positions.append(i)
                    queue = merged_positions

                fusions_to_add.extend(
                    [fusion for fusion in all_fusions if fusion])

        self.flush()
        for fusion in fusions_to_add:
            self.add_fusion(fusion)

        if "vs." not in self.name:
            self.logger.debug("* Full: " + str(old_count))
            self.logger.debug("* Gene-spanning: " +
                              str(old_count - stats_non_gene_spanning))
            self.logger.debug("* Unique: " + str(len(self)))

        return len(self)