Ejemplo n.º 1
0
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
	"""Yield the atomic alterations that make up a possibly compound alteration.

	Both k-mer paths are turned into sequences with ``ALT.kmerpathToSeq`` and
	compared with ``Levenshtein.editops``. The resulting edit operations are
	consumed left to right: a ``replace`` is applied on its own, while a run of
	consecutive operations of the same type that share the source position or
	the destination position of the first operation of the run is applied as a
	single alteration.

	:param reference_path: k-mer path of the reference.
	:param alternative_path: k-mer path of the alternative.
	:param kmer_length: k-mer length, forwarded to ``ALT.kmerpathToSeq`` and
		``ALT.kmerize``.
	:return: generator of ``(atomic_sequence, atomic_path)`` pairs, where
		``atomic_sequence`` is the reference sequence with one group of edit
		operations applied and ``atomic_path`` is ``ALT.kmerize`` of it.
	"""
	reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
	multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)

	edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
	if len(edit_ops) > 2:
		logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
		logger.info("Globally apply %s", edit_ops)
	start = 0
	while start < len(edit_ops):
		start_e = edit_ops[start]
		end = start + 1
		# Each edit operation is an (operation, source_pos, destination_pos)
		# tuple, so the operation name has to be read from index 0. Comparing
		# the whole tuple to 'replace' is never true.
		if start_e[0] != 'replace':
			# Extend the group while the operation type is unchanged and the
			# operation still targets the same source or destination position.
			while (end < len(edit_ops)
				   and edit_ops[end][0] == start_e[0]
				   and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
				end += 1
			logger.info("Will apply %s", edit_ops[start:end])
		edit_op_to_apply = edit_ops[start:end]
		start = end
		atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
		atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
		# record each atomic alteration
		logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
		yield atomic_sequence, atomic_path
Ejemplo n.º 2
0
def decompose_multiple_alterations(reference_path, alternative_path,
                                   kmer_length):
    """Yield the atomic alterations that make up a possibly compound alteration.

    Both k-mer paths are turned into sequences with ``ALT.kmerpathToSeq`` and
    compared with ``Levenshtein.editops``. The resulting edit operations are
    consumed left to right: a ``replace`` is applied on its own, while a run of
    consecutive operations of the same type that share the source position or
    the destination position of the first operation of the run is applied as a
    single alteration.

    :param reference_path: k-mer path of the reference.
    :param alternative_path: k-mer path of the alternative.
    :param kmer_length: k-mer length, forwarded to ``ALT.kmerpathToSeq`` and
        ``ALT.kmerize``.
    :return: generator of ``(atomic_sequence, atomic_path)`` pairs, where
        ``atomic_sequence`` is the reference sequence with one group of edit
        operations applied and ``atomic_path`` is ``ALT.kmerize`` of it.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path,
                                                   kmer_length)

    edit_ops = Levenshtein.editops(reference_sequence,
                                   multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s",
                    reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)
    start = 0
    while start < len(edit_ops):
        start_e = edit_ops[start]
        end = start + 1
        # Each edit operation is an (operation, source_pos, destination_pos)
        # tuple, so the operation name has to be read from index 0. Comparing
        # the whole tuple to 'replace' is never true.
        if start_e[0] != 'replace':
            # Extend the group while the operation type is unchanged and the
            # operation still targets the same source or destination position.
            while (end < len(edit_ops) and edit_ops[end][0] == start_e[0]
                   and (start_e[1] == edit_ops[end][1]
                        or start_e[2] == edit_ops[end][2])):
                end += 1
            logger.info("Will apply %s", edit_ops[start:end])
        edit_op_to_apply = edit_ops[start:end]
        start = end
        atomic_sequence = Levenshtein.apply_edit(
            edit_op_to_apply, reference_sequence,
            multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s",
                    reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
Ejemplo n.º 3
0
    def alteration_list_init(self, G_ref, kmer_length, min_support, max_len):
        """Rebuild ``self.alteration_list`` from the alternative paths of the sample graph.

        Alternative paths are the simple paths of ``self.dbg_refrm`` between a
        candidate start node and a candidate end node. Candidates are the nodes
        shared by ``self.dbg_refrm`` and ``G_ref`` (out degree > 0 for starts,
        in degree > 0 for ends), plus the zero-degree tips of ``self.dbgclean``
        that are absent from ``G_ref`` and listed in ``self.kmer_start_set`` /
        ``self.kmer_end_set``.

        A path is kept when at most two of its nodes belong to ``G_ref`` and
        the number of reads common to all its nodes is greater than
        ``min_support`` percent of the highest node coverage on the path. A
        reference path is then selected in ``G_ref`` and one ``ALT`` is
        appended for every atomic alteration yielded by
        ``decompose_multiple_alterations``.

        :param G_ref: reference graph.
        :param kmer_length: k-mer length forwarded to ``ALT``.
        :param min_support: support threshold, expressed as a percentage.
        :param max_len: largest accepted difference between the number of
            nodes of the reference path and of the alternative path.
        """
        self.alteration_list = []
        # Only nodes in dbg_refrm & G_ref and with in degree > 0 for end nodes and out degree > 0 for start nodes
        G_ref_nodes_set = set(G_ref.nodes())
        shared_nodes = list(set(self.dbg_refrm.nodes()) & G_ref_nodes_set)
        out_d = self.dbg_refrm.out_degree()
        in_d = self.dbg_refrm.in_degree()
        shared_nodes_start = [x for x in shared_nodes if out_d[x] > 0]
        shared_nodes_end = [x for x in shared_nodes if in_d[x] > 0]
        # Add tips end & start in shared_nodes_end & start
        end_tips_list = [
            key for key, degree in self.dbgclean.out_degree().items()
            if degree == 0 and key not in G_ref
            and key in self.kmer_end_set
        ]
        start_tips_list = [
            key for key, degree in self.dbgclean.in_degree().items()
            if degree == 0 and key not in G_ref
            and key in self.kmer_start_set
        ]
        shared_nodes_start.extend(start_tips_list)
        shared_nodes_end.extend(end_tips_list)
        # Search for alternative paths
        for node_start in shared_nodes_start:
            for node_end in shared_nodes_end:
                for alternative_path in nx.all_simple_paths(
                        self.dbg_refrm, node_start, node_end):
                    # Skip paths that touch the reference on more than two nodes
                    if len(set(alternative_path) & G_ref_nodes_set) > 2:
                        continue
                    # Compute coverage of the alternative path
                    total_coverage = max(
                        self.total_coverage_node(alt_node)
                        for alt_node in alternative_path)
                    # Read intersection of all nodes in the alt path for G_sample
                    intersect_allnodes_pathAlt_G_sample = set.intersection(*[
                        set(self.dbg_refrm.node[node]['read_list_n'])
                        for node in alternative_path
                    ])
                    if len(intersect_allnodes_pathAlt_G_sample
                           ) <= total_coverage * min_support / 100:
                        continue
                    # Reference path choice
                    # The reference endpoints are locals scoped to this
                    # alternative path: overwriting the loop variables would
                    # make the next path of the same (start, end) pair reuse an
                    # anchor that was computed for another path length.
                    ref_start, ref_end = node_start, node_end
                    # Replace start/end if it's a tip
                    if ref_start not in G_ref:
                        logger.critical(
                            "The node %s (read support : %d) is a tip (start)",
                            node_start,
                            len(self.dbg_refrm.node[alternative_path[1]]
                                ['read_list_n']))
                        ref_start = identify_anchor_kmer_in_reference_graph(
                            G_ref,
                            node_start,
                            rightmost=node_end,
                            path_length=len(alternative_path))
                        logger.critical("Node %s anchored to %s", node_start,
                                        ref_start)

                    if ref_end not in G_ref:
                        # NOTE(review): the read support logged here is the one
                        # of alternative_path[1], as for a start tip -- confirm
                        # whether alternative_path[-2] was intended.
                        logger.critical(
                            "The node %s (read support : %d) is a tip (end)",
                            node_end,
                            len(self.dbg_refrm.node[alternative_path[1]]
                                ['read_list_n']))
                        # NOTE(review): the (possibly anchored) start node is
                        # passed both as the k-mer to anchor and as `leftmost`;
                        # node_end looks like the intended k-mer -- confirm
                        # against identify_anchor_kmer_in_reference_graph.
                        ref_end = identify_anchor_kmer_in_reference_graph(
                            G_ref,
                            ref_start,
                            leftmost=ref_start,
                            path_length=len(alternative_path))
                        logger.critical("Node %s anchored to %s", node_end,
                                        ref_end)

                    reference_path_list = list(
                        nx.all_simple_paths(G_ref, ref_start, ref_end))
                    if not reference_path_list:
                        logger.critical("No reference path between %s and %s",
                                        ref_start, ref_end)
                        logger.critical("Alternative path : %s",
                                        alternative_path)
                        continue

                    # if there are multiple reference paths, keep the one sharing the most nodes with the alternative path
                    # if the shared node counts are equal, keep the one whose length is the closest to the alternative path
                    reference_path = reference_path_list[0]
                    if len(reference_path_list) > 1:
                        logger.debug("Trying to identify actual reference")
                        alternative_nodes = set(alternative_path)
                        size_alternative_path = len(alternative_path)
                        size_biggest_intersection = len(
                            alternative_nodes & set(reference_path))
                        logger.debug("Selected ref path num 0 with size %d",
                                     size_biggest_intersection)
                        for i_reference_path, curr_reference_path in enumerate(
                                reference_path_list[1:], 1):
                            size_intersection = len(
                                alternative_nodes & set(curr_reference_path))
                            if size_intersection > size_biggest_intersection:
                                size_biggest_intersection = size_intersection
                                reference_path = curr_reference_path
                                logger.debug(
                                    "Switching to ref path num %d with size %d",
                                    i_reference_path,
                                    size_biggest_intersection)
                            elif size_intersection == size_biggest_intersection:
                                delta_1 = abs(len(reference_path) -
                                              size_alternative_path)
                                delta_2 = abs(len(curr_reference_path) -
                                              size_alternative_path)
                                if delta_2 < delta_1:
                                    reference_path = curr_reference_path
                                    logger.debug(
                                        "Switching to ref path num %d with size %d and deltas: %d--%d ",
                                        i_reference_path,
                                        size_biggest_intersection, delta_2,
                                        delta_1)
                        # Sanity checks kept from the original implementation;
                        # the second one fails when the selected reference
                        # path shares no node with the alternative path.
                        assert reference_path
                        assert size_biggest_intersection
                    # Read intersection of all nodes in the reference path for g_patient
                    read_set_pathRef_G_sample = []
                    for node in reference_path:
                        if node not in self.dbg:
                            logger.critical(
                                "Identified node %s absent from the input DBG",
                                node)
                            # One-character placeholder: its len() is 1, so the
                            # reference support passed to ALT is 1 rather than 0.
                            intersect_allnodes_pathRef_G_sample = "0"  # Weird smoothing, TODO check with justine if required
                            break
                        read_set_pathRef_G_sample.append(
                            set(self.dbg.node[node]['read_list_n']))
                    else:
                        # Reached only when every reference node is in self.dbg
                        intersect_allnodes_pathRef_G_sample = set.intersection(
                            *read_set_pathRef_G_sample)
                    if abs(len(reference_path) -
                           len(alternative_path)) > max_len:
                        logger.critical(
                            "Disregarding large alteration %s vs %s",
                            reference_path, alternative_path)
                        continue

                    reference_sequence = ALT.kmerpathToSeq(
                        reference_path, kmer_length)
                    support_threshold = max(
                        self.total_coverage_node(ref_start),
                        self.total_coverage_node(ref_end)) * min_support / 100
                    # Decompose path if it is multiple
                    for atomic_sequence, atomic_path in decompose_multiple_alterations(
                            reference_path, alternative_path, kmer_length):
                        self.alteration_list.append(
                            ALT(
                                reference_path, atomic_path,
                                reference_sequence, atomic_sequence,
                                len(intersect_allnodes_pathRef_G_sample),
                                len(intersect_allnodes_pathAlt_G_sample),
                                kmer_length,
                                support_threshold))
Ejemplo n.º 4
0
	def alteration_list_init(self, G_ref, k, alpha):
		"""Rebuild ``self.alteration_list`` from the alternative paths of the sample graph.

		Alternative paths are the simple paths of ``self.dbg_refrm`` between a
		candidate start node and a candidate end node. Candidates are the nodes
		shared by ``self.dbg_refrm`` and ``G_ref`` (out degree > 0 for starts,
		in degree > 0 for ends), plus the zero-degree tips of ``self.dbgclean``
		that are absent from ``G_ref`` and listed in ``self.kmer_start_set`` /
		``self.kmer_end_set``.

		A path is kept when at most two of its nodes belong to ``G_ref`` and at
		least one read is common to all its nodes. A tip endpoint is replaced by
		the first zero in-degree (start) or zero out-degree (end) node of
		``G_ref``. If no reference path joins the endpoints, the search is
		retried through one successor of the end and one predecessor of the
		start. When several reference paths exist, the one with the best
		``sw.align`` score against the alternative sequence is selected.

		:param G_ref: reference graph.
		:param k: k-mer length forwarded to ``ALT``.
		:param alpha: support threshold, expressed as a percentage.
		:raises IndexError: if ``G_ref`` has no zero in-degree or no zero
			out-degree node.
		"""
		self.alteration_list = []
		# Only nodes in dbg_refrm & G_ref and with in degree > 0 for end nodes and out degree > 0 for start nodes
		G_ref_nodes_set = set(G_ref.nodes())
		shared_nodes = list(set(self.dbg_refrm.nodes()) & G_ref_nodes_set)
		out_d = self.dbg_refrm.out_degree()
		in_d = self.dbg_refrm.in_degree()
		shared_nodes_start = [x for x in shared_nodes if out_d[x] > 0]
		shared_nodes_end = [x for x in shared_nodes if in_d[x] > 0]
		# Extremities of the reference graph, used as anchors for tips
		start_g_ref = [key for key, degree in G_ref.in_degree().items() if degree == 0][0]  # only one in TP53
		end_g_ref = [key for key, degree in G_ref.out_degree().items() if degree == 0][0]  # only one in TP53
		# Add tips end & start in shared_nodes_end & start
		end_tips_list = [key for key, degree in self.dbgclean.out_degree().items()
						 if degree == 0 and key not in G_ref and key in self.kmer_end_set]
		start_tips_list = [key for key, degree in self.dbgclean.in_degree().items()
						   if degree == 0 and key not in G_ref and key in self.kmer_start_set]
		shared_nodes_start.extend(start_tips_list)
		shared_nodes_end.extend(end_tips_list)
		for node_start in shared_nodes_start:
			for node_end in shared_nodes_end:
				for alternative_path in nx.all_simple_paths(self.dbg_refrm, node_start, node_end):
					# Skip paths that touch the reference on more than two nodes
					if len(set(alternative_path) & G_ref_nodes_set) > 2:
						continue
					# Read intersection of all nodes in the alt path for G_sample
					intersect_allnodes_pathAlt_G_sample = set.intersection(
						*[set(self.dbg_refrm.node[node]['read_list_n']) for node in alternative_path])
					if len(intersect_allnodes_pathAlt_G_sample) == 0:
						continue
					## Reference path choice
					# The reference endpoints are locals scoped to this
					# alternative path: the tip replacement and the
					# successor/predecessor fallback below rewrite them, and
					# that must not affect the next alternative path of the
					# same (start, end) pair.
					ref_start, ref_end = node_start, node_end
					# Replace start/end if it's a tip
					if ref_start not in G_ref:
						logger.critical("The node %s (read support : %d) is a tips(start)", node_start, len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
						ref_start = start_g_ref
					if ref_end not in G_ref:
						# NOTE(review): the read support logged here is the one
						# of alternative_path[1], as for a start tip -- confirm
						# whether alternative_path[-2] was intended.
						logger.critical("The node %s (read support : %d) is a tips(end)", node_end, len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
						ref_end = end_g_ref
					reference_path_list = list(nx.all_simple_paths(G_ref, ref_start, ref_end))
					# if there is no reference path, check predecessors/successors of start/end nodes of the path (just +1 at this moment)
					if len(reference_path_list) == 0:
						reference_path_list_successor = []
						reference_path_list_predecessor = []
						for successor in G_ref.successors(ref_end):
							reference_path_list_successor.extend(nx.all_simple_paths(G_ref, ref_start, successor))
							if len(reference_path_list_successor) > 0:
								logger.critical("Successor is add to the reference and alternative path between %s (ref list : %s) and %s (ref list : %s)", ref_start, str(G_ref.node[ref_start]['ref_list']), ref_end, G_ref.node[ref_end]['ref_list'])
								alternative_path.append(successor)
								ref_end = successor
								reference_path_list = reference_path_list_successor
								break
						# The predecessor search runs even when a successor was
						# accepted, and then uses the extended end node.
						for predecessor in G_ref.predecessors(ref_start):
							reference_path_list_predecessor.extend(nx.all_simple_paths(G_ref, predecessor, ref_end))
							if len(reference_path_list_predecessor) > 0:
								logger.critical("Predecessor is add to the reference and alternative path between %s (ref list : %s) and %s (ref list : %s)", ref_start, str(G_ref.node[ref_start]['ref_list']), ref_end, G_ref.node[ref_end]['ref_list'])
								alternative_path.insert(0, predecessor)
								ref_start = predecessor
								reference_path_list = reference_path_list_predecessor
								break
						if len(reference_path_list_predecessor) == 0 and len(reference_path_list_successor) == 0:
							logger.critical("No reference path between %s (ref list : %s) and %s (ref list : %s)", ref_start, str(G_ref.node[ref_start]['ref_list']), ref_end, G_ref.node[ref_end]['ref_list'])
							logger.critical("Alternative path : %s", alternative_path)
							continue
					if len(reference_path_list) > 1:
						# The first candidate is always taken as the initial
						# choice, so the tie-break below never reads an unbound
						# reference_path or one left over from a previous
						# alternative path.
						reference_path = None
						alignment_score = 0
						alternative_sequence = ALT.kmerpathToSeq(alternative_path, k)
						for candidate_path in reference_path_list:
							reference_sequence = ALT.kmerpathToSeq(candidate_path, k)
							alignment = sw.align(alternative_sequence, reference_sequence)
							if reference_path is None or alignment.score > alignment_score:
								alignment_score = alignment.score
								reference_path = candidate_path
							elif alignment.score == alignment_score:
								# Tie-break: count the ref_list keys over all nodes of each path and keep the reference path with the smallest count
								old_ref_list_size = sum(len(G_ref.node[node2check]['ref_list'].keys()) for node2check in reference_path)
								new_ref_list_size = sum(len(G_ref.node[node2check]['ref_list'].keys()) for node2check in candidate_path)
								if old_ref_list_size > new_ref_list_size:
									reference_path = candidate_path
								elif old_ref_list_size == new_ref_list_size:
									logger.critical("Same et size of reference paths")
					else:
						reference_path = reference_path_list[0]
					# Read intersection of all nodes in the reference path for G_sample
					read_set_pathRef_G_sample = []
					for node in reference_path:
						if node not in self.dbg:
							# Reference path not represented in the sample
							# graph. One-character placeholder: its len() is 1,
							# so the reference support passed to ALT is 1.
							intersect_allnodes_pathRef_G_sample = "0"
							break
						read_set_pathRef_G_sample.append(set(self.dbg.node[node]['read_list_n']))
					else:
						# Reached only when every reference node is in self.dbg
						intersect_allnodes_pathRef_G_sample = set.intersection(*read_set_pathRef_G_sample)
					support_threshold = max(self.total_coverage_node(ref_start), self.total_coverage_node(ref_end)) * alpha / 100
					self.alteration_list.append(ALT(reference_path, alternative_path, len(intersect_allnodes_pathRef_G_sample), len(intersect_allnodes_pathAlt_G_sample), k, support_threshold))
Ejemplo n.º 5
0
	def alteration_list_init(self, G_ref, kmer_length, min_support, max_len):
		"""Rebuild ``self.alteration_list`` from the alternative paths of the sample graph.

		Alternative paths are the simple paths of ``self.dbg_refrm`` between a
		candidate start node and a candidate end node. Candidates are the nodes
		shared by ``self.dbg_refrm`` and ``G_ref`` (out degree > 0 for starts,
		in degree > 0 for ends), plus the zero-degree tips of ``self.dbgclean``
		that are absent from ``G_ref`` and listed in ``self.kmer_start_set`` /
		``self.kmer_end_set``.

		A path is kept when at most two of its nodes belong to ``G_ref`` and
		the number of reads common to all its nodes is greater than
		``min_support`` percent of the highest node coverage on the path. A
		reference path is then selected in ``G_ref`` and one ``ALT`` is
		appended for every atomic alteration yielded by
		``decompose_multiple_alterations``.

		:param G_ref: reference graph.
		:param kmer_length: k-mer length forwarded to ``ALT``.
		:param min_support: support threshold, expressed as a percentage.
		:param max_len: largest accepted difference between the number of
			nodes of the reference path and of the alternative path.
		"""
		self.alteration_list = []
		# Only nodes in dbg_refrm & G_ref and with in degree > 0 for end nodes and out degree > 0 for start nodes
		G_ref_nodes_set = set(G_ref.nodes())
		shared_nodes = list(set(self.dbg_refrm.nodes()) & G_ref_nodes_set)
		out_d = self.dbg_refrm.out_degree()
		in_d = self.dbg_refrm.in_degree()
		shared_nodes_start = [x for x in shared_nodes if out_d[x] > 0]
		shared_nodes_end = [x for x in shared_nodes if in_d[x] > 0]
		# Add tips end & start in shared_nodes_end & start
		end_tips_list = [key for key, degree in self.dbgclean.out_degree().items() if
						 degree == 0 and key not in G_ref and key in self.kmer_end_set]
		start_tips_list = [key for key, degree in self.dbgclean.in_degree().items() if
						   degree == 0 and key not in G_ref and key in self.kmer_start_set]
		shared_nodes_start.extend(start_tips_list)
		shared_nodes_end.extend(end_tips_list)
		# Search for alternative paths
		for node_start in shared_nodes_start:
			for node_end in shared_nodes_end:
				for alternative_path in nx.all_simple_paths(self.dbg_refrm, node_start, node_end):
					# Skip paths that touch the reference on more than two nodes
					if len(set(alternative_path) & G_ref_nodes_set) > 2:
						continue
					# Compute coverage of the alternative path
					total_coverage = max(self.total_coverage_node(alt_node) for alt_node in alternative_path)
					# Read intersection of all nodes in the alt path for G_sample
					intersect_allnodes_pathAlt_G_sample = set.intersection(
						*[set(self.dbg_refrm.node[node]['read_list_n']) for node in alternative_path])
					if len(intersect_allnodes_pathAlt_G_sample) <= total_coverage * min_support / 100:
						continue
					# Reference path choice
					# The reference endpoints are locals scoped to this
					# alternative path: overwriting the loop variables would
					# make the next path of the same (start, end) pair reuse an
					# anchor that was computed for another path length.
					ref_start, ref_end = node_start, node_end
					# Replace start/end if it's a tip
					if ref_start not in G_ref:
						logger.critical("The node %s (read support : %d) is a tip (start)", node_start,
										len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
						ref_start = identify_anchor_kmer_in_reference_graph(G_ref, node_start, rightmost=node_end,
																			path_length=len(alternative_path))
						logger.critical("Node %s anchored to %s", node_start, ref_start)

					if ref_end not in G_ref:
						# NOTE(review): the read support logged here is the one
						# of alternative_path[1], as for a start tip -- confirm
						# whether alternative_path[-2] was intended.
						logger.critical("The node %s (read support : %d) is a tip (end)", node_end,
										len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
						# NOTE(review): the (possibly anchored) start node is
						# passed both as the k-mer to anchor and as `leftmost`;
						# node_end looks like the intended k-mer -- confirm
						# against identify_anchor_kmer_in_reference_graph.
						ref_end = identify_anchor_kmer_in_reference_graph(G_ref, ref_start, leftmost=ref_start,
																		  path_length=len(alternative_path))
						logger.critical("Node %s anchored to %s", node_end, ref_end)

					reference_path_list = list(nx.all_simple_paths(G_ref, ref_start, ref_end))
					if not reference_path_list:
						logger.critical("No reference path between %s and %s", ref_start, ref_end)
						logger.critical("Alternative path : %s", alternative_path)
						continue

					# if there are multiple reference paths, keep the one sharing the most nodes with the alternative path
					# if the shared node counts are equal, keep the one whose length is the closest to the alternative path
					reference_path = reference_path_list[0]
					if len(reference_path_list) > 1:
						logger.debug("Trying to identify actual reference")
						alternative_nodes = set(alternative_path)
						size_alternative_path = len(alternative_path)
						size_biggest_intersection = len(alternative_nodes & set(reference_path))
						logger.debug("Selected ref path num 0 with size %d", size_biggest_intersection)
						for i_reference_path, curr_reference_path in enumerate(reference_path_list[1:], 1):
							size_intersection = len(alternative_nodes & set(curr_reference_path))
							if size_intersection > size_biggest_intersection:
								size_biggest_intersection = size_intersection
								reference_path = curr_reference_path
								logger.debug("Switching to ref path num %d with size %d", i_reference_path, size_biggest_intersection)
							elif size_intersection == size_biggest_intersection:
								delta_1 = abs(len(reference_path) - size_alternative_path)
								delta_2 = abs(len(curr_reference_path) - size_alternative_path)
								if delta_2 < delta_1:
									reference_path = curr_reference_path
									logger.debug("Switching to ref path num %d with size %d and deltas: %d--%d ", i_reference_path,
												 size_biggest_intersection, delta_2, delta_1)
						# Sanity checks kept from the original implementation;
						# the second one fails when the selected reference
						# path shares no node with the alternative path.
						assert reference_path
						assert size_biggest_intersection
					# Read intersection of all nodes in the reference path for g_patient
					read_set_pathRef_G_sample = []
					for node in reference_path:
						if node not in self.dbg:
							logger.critical("Identified node %s absent from the input DBG", node)
							# One-character placeholder: its len() is 1, so the
							# reference support passed to ALT is 1 rather than 0.
							intersect_allnodes_pathRef_G_sample = "0"  # Weird smoothing, TODO check with justine if required
							break
						read_set_pathRef_G_sample.append(set(self.dbg.node[node]['read_list_n']))
					else:
						# Reached only when every reference node is in self.dbg
						intersect_allnodes_pathRef_G_sample = set.intersection(*read_set_pathRef_G_sample)
					if abs(len(reference_path) - len(alternative_path)) > max_len:
						logger.critical("Disregarding large alteration %s vs %s", reference_path, alternative_path)
						continue

					reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
					support_threshold = max(self.total_coverage_node(ref_start),
											self.total_coverage_node(ref_end)) * min_support / 100
					# Decompose path if it is multiple
					for atomic_sequence, atomic_path in decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
						self.alteration_list.append(ALT(reference_path, atomic_path, reference_sequence, atomic_sequence,
														len(intersect_allnodes_pathRef_G_sample),
														len(intersect_allnodes_pathAlt_G_sample), kmer_length,
														support_threshold))