def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
    """Yield the atomic alterations separating a reference and an alternative path.

    Both k-mer paths are turned back into sequences with ``ALT.kmerpathToSeq``
    and the edit operations between them are computed with
    ``Levenshtein.editops``. The operations are then applied to the reference
    sequence one group at a time:

    * a ``replace`` operation is applied on its own;
    * consecutive operations of the same kind that share either their source
      or their destination position are applied together.

    :param reference_path: k-mer path of the reference.
    :param alternative_path: k-mer path of the (possibly multiple) alternative.
    :param kmer_length: k-mer length, forwarded to ``ALT.kmerpathToSeq`` and
        ``ALT.kmerize``.
    :return: generator of ``(atomic_sequence, atomic_path)`` tuples, one per
        group of edit operations.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)
    edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s",
                    reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)

    start = 0
    while start < len(edit_ops):
        first_op = edit_ops[start]
        # Each edit operation is an (operation, source_pos, dest_pos) triple,
        # so the operation name is its first item. Comparing the whole triple
        # with 'replace' (as was done before) can never be true.
        if first_op[0] == 'replace':
            edit_op_to_apply = [first_op]
            start += 1
        else:
            # Gather the run of operations of the same kind anchored on the
            # same source or destination position as the first one.
            end = start + 1
            while (end < len(edit_ops)
                   and edit_ops[end][0] == first_op[0]
                   and (first_op[1] == edit_ops[end][1]
                        or first_op[2] == edit_ops[end][2])):
                end += 1
            edit_op_to_apply = edit_ops[start:end]
            start = end
            logger.info("Will apply %s", edit_op_to_apply)

        atomic_sequence = Levenshtein.apply_edit(
            edit_op_to_apply, reference_sequence, multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s",
                    reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
    """Yield the atomic alterations separating a reference and an alternative path.

    Both k-mer paths are turned back into sequences with ``ALT.kmerpathToSeq``
    and the edit operations between them are computed with
    ``Levenshtein.editops``. The operations are then applied to the reference
    sequence one group at a time:

    * a ``replace`` operation is applied on its own;
    * consecutive operations of the same kind that share either their source
      or their destination position are applied together.

    :param reference_path: k-mer path of the reference.
    :param alternative_path: k-mer path of the (possibly multiple) alternative.
    :param kmer_length: k-mer length, forwarded to ``ALT.kmerpathToSeq`` and
        ``ALT.kmerize``.
    :return: generator of ``(atomic_sequence, atomic_path)`` tuples, one per
        group of edit operations.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)
    edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s",
                    reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)

    start = 0
    while start < len(edit_ops):
        first_op = edit_ops[start]
        # Each edit operation is an (operation, source_pos, dest_pos) triple,
        # so the operation name is its first item. Comparing the whole triple
        # with 'replace' (as was done before) can never be true.
        if first_op[0] == 'replace':
            edit_op_to_apply = [first_op]
            start += 1
        else:
            # Gather the run of operations of the same kind anchored on the
            # same source or destination position as the first one.
            end = start + 1
            while (end < len(edit_ops)
                   and edit_ops[end][0] == first_op[0]
                   and (first_op[1] == edit_ops[end][1]
                        or first_op[2] == edit_ops[end][2])):
                end += 1
            edit_op_to_apply = edit_ops[start:end]
            start = end
            logger.info("Will apply %s", edit_op_to_apply)

        atomic_sequence = Levenshtein.apply_edit(
            edit_op_to_apply, reference_sequence, multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s",
                    reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def alteration_list_init(self, G_ref, kmer_length, min_support, max_len):
    """Rebuild ``self.alteration_list`` from the paths of ``self.dbg_refrm``.

    Every simple path of ``self.dbg_refrm`` between a start candidate and an
    end candidate is considered as an alternative path. For each one that
    passes the filters below, a reference path is selected in ``G_ref`` and
    one ``ALT`` is appended per atomic alteration yielded by
    ``decompose_multiple_alterations``.

    A path is skipped when it shares more than two nodes with ``G_ref``, when
    the reads common to all its nodes do not exceed
    ``total_coverage * min_support / 100``, when no reference path exists
    between its (possibly anchored) extremities, or when the reference and
    alternative paths differ by more than ``max_len`` nodes.

    :param G_ref: reference graph.
    :param kmer_length: k-mer length, forwarded to ``ALT.kmerpathToSeq``,
        ``decompose_multiple_alterations`` and ``ALT``.
    :param min_support: percentage applied to coverage values to derive the
        read-support thresholds.
    :param max_len: largest accepted difference between the number of nodes
        of the reference path and of the alternative path.
    """
    self.alteration_list = []

    # Only nodes in dbg_refrm & G_ref and with in degree > 0 for end nodes
    # and out degree > 0 for start nodes
    G_ref_nodes_set = set(G_ref.nodes())
    shared_nodes = list(set(self.dbg_refrm.nodes()) & G_ref_nodes_set)
    out_d = self.dbg_refrm.out_degree()
    in_d = self.dbg_refrm.in_degree()
    shared_nodes_start = [x for x in shared_nodes if out_d[x] > 0]
    shared_nodes_end = [x for x in shared_nodes if in_d[x] > 0]

    # Add tips: nodes of dbgclean without successor (end) or predecessor
    # (start), absent from G_ref and listed in kmer_end_set / kmer_start_set.
    out_degree_g_testclean_dict = self.dbgclean.out_degree()
    in_degree_g_testclean_dict = self.dbgclean.in_degree()
    end_tips_list = [
        key for key, degree in out_degree_g_testclean_dict.items()
        if degree == 0 and key not in G_ref and key in self.kmer_end_set
    ]
    start_tips_list = [
        key for key, degree in in_degree_g_testclean_dict.items()
        if degree == 0 and key not in G_ref and key in self.kmer_start_set
    ]
    shared_nodes_start.extend(start_tips_list)
    shared_nodes_end.extend(end_tips_list)

    # Search for alternative paths
    for node_start in shared_nodes_start:
        for node_end in shared_nodes_end:
            for alternative_path in nx.all_simple_paths(self.dbg_refrm, node_start, node_end):
                alternative_nodes = set(alternative_path)
                if len(alternative_nodes & G_ref_nodes_set) > 2:
                    continue

                # Compute coverage of the alternative path
                total_coverage = max(
                    [self.total_coverage_node(alt_node) for alt_node in alternative_path])

                # Reads shared by all nodes of the alternative path
                intersect_allnodes_pathAlt_G_sample = set.intersection(
                    *[set(self.dbg_refrm.node[node]['read_list_n'])
                      for node in alternative_path])
                if len(intersect_allnodes_pathAlt_G_sample) <= total_coverage * min_support / 100:
                    continue

                # Reference path choice.
                # The reference extremities live in their own variables: the
                # loop variables used to be overwritten with the anchors and
                # were only restored at the very end of the iteration, so any
                # `continue` below leaked a stale anchor into the following
                # alternative paths and graph searches.
                ref_start, ref_end = node_start, node_end
                if node_start not in G_ref:
                    logger.critical(
                        "The node %s (read support : %d) is a tip (start)",
                        node_start,
                        len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
                    anchor = identify_anchor_kmer_in_reference_graph(
                        G_ref, node_start, rightmost=node_end,
                        path_length=len(alternative_path))
                    logger.critical("Node %s anchored to %s", node_start, anchor)
                    ref_start = anchor
                if node_end not in G_ref:
                    logger.critical(
                        "The node %s (read support : %d) is a tip (end)",
                        node_end,
                        len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
                    # NOTE(review): the k-mer handed to the anchoring helper is
                    # the (possibly anchored) start node, not the end tip, and
                    # the logged read support is the one of the second node of
                    # the path. Kept as found; confirm both are intended.
                    anchor = identify_anchor_kmer_in_reference_graph(
                        G_ref, ref_start, leftmost=ref_start,
                        path_length=len(alternative_path))
                    logger.critical("Node %s anchored to %s", node_end, anchor)
                    ref_end = anchor

                reference_path_list = list(nx.all_simple_paths(G_ref, ref_start, ref_end))
                if len(reference_path_list) == 0:
                    logger.critical("No reference path between %s and %s", ref_start, ref_end)
                    logger.critical("Alternative path : %s", alternative_path)
                    continue

                # With several reference paths, keep the one sharing the most
                # nodes with the alternative path; on a tie, keep the one whose
                # length is closest to the alternative path's.
                reference_path = reference_path_list[0]
                if len(reference_path_list) > 1:
                    logger.debug("Trying to identify actual reference")
                    size_alternative_path = len(alternative_path)
                    size_biggest_intersection = len(alternative_nodes & set(reference_path))
                    logger.debug("Selected ref path num 0 with size %d", size_biggest_intersection)
                    for i_reference_path in range(1, len(reference_path_list)):
                        curr_reference_path = reference_path_list[i_reference_path]
                        size_intersection = len(alternative_nodes & set(curr_reference_path))
                        if size_intersection > size_biggest_intersection:
                            size_biggest_intersection = size_intersection
                            reference_path = curr_reference_path
                            logger.debug("Switching to ref path num %d with size %d",
                                         i_reference_path, size_biggest_intersection)
                        elif size_intersection == size_biggest_intersection:
                            delta_1 = abs(len(reference_path) - size_alternative_path)
                            delta_2 = abs(len(curr_reference_path) - size_alternative_path)
                            if delta_2 < delta_1:
                                reference_path = curr_reference_path
                                logger.debug(
                                    "Switching to ref path num %d with size %d and deltas: %d--%d ",
                                    i_reference_path, size_biggest_intersection,
                                    delta_2, delta_1)
                    # Sanity checks kept from the original implementation; the
                    # second one fails when the selected reference path shares
                    # no node with the alternative path.
                    assert reference_path
                    assert size_biggest_intersection

                # Reads shared by all nodes of the reference path in self.dbg
                read_set_pathRef_G_sample = []
                for node in reference_path:
                    if node not in self.dbg:
                        logger.critical("Identified node %s absent from the input DBG", node)
                        # Weird smoothing, TODO check with justine if required:
                        # the one-character string makes the count passed to
                        # ALT equal to 1 rather than 0.
                        intersect_allnodes_pathRef_G_sample = "0"
                        break
                    read_set_pathRef_G_sample.append(set(self.dbg.node[node]['read_list_n']))
                else:
                    # Reached only when every reference node is in self.dbg.
                    intersect_allnodes_pathRef_G_sample = set.intersection(
                        *read_set_pathRef_G_sample)

                if abs(len(reference_path) - len(alternative_path)) > max_len:
                    logger.critical("Disregarding large alteration %s vs %s",
                                    reference_path, alternative_path)
                    continue

                reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
                # Decompose path if it is multiple
                for atomic_sequence, atomic_path in decompose_multiple_alterations(
                        reference_path, alternative_path, kmer_length):
                    self.alteration_list.append(
                        ALT(reference_path, atomic_path,
                            reference_sequence, atomic_sequence,
                            len(intersect_allnodes_pathRef_G_sample),
                            len(intersect_allnodes_pathAlt_G_sample),
                            kmer_length,
                            max(self.total_coverage_node(ref_start),
                                self.total_coverage_node(ref_end)) * min_support / 100))
def alteration_list_init(self, G_ref, k, alpha):
    """Rebuild ``self.alteration_list`` from the paths of ``self.dbg_refrm``.

    Every simple path of ``self.dbg_refrm`` between a start candidate and an
    end candidate is considered as an alternative path. Paths sharing more
    than two nodes with ``G_ref``, or whose nodes have no read in common, are
    skipped. For the others a reference path is selected in ``G_ref`` and one
    ``ALT`` is appended to ``self.alteration_list``.

    When an extremity is a tip (absent from ``G_ref``) the reference path is
    searched from the source / to the sink of ``G_ref`` instead. When no
    reference path exists, the search is retried with the successors of the
    end node, then with the predecessors of the start node; a node found that
    way is also added to the alternative path.

    :param G_ref: reference graph; assumed to have at least one node without
        predecessor and one without successor (an ``IndexError`` is raised
        otherwise).
    :param k: k-mer length, forwarded to ``ALT.kmerpathToSeq`` and ``ALT``.
    :param alpha: percentage applied to the coverage of the extremities to
        derive the last argument given to ``ALT``.
    """
    self.alteration_list = []

    # Only nodes in dbg_refrm & G_ref and with in degree > 0 for end nodes
    # and out degree > 0 for start nodes
    G_ref_nodes_set = set(G_ref.nodes())
    shared_nodes = list(set(self.dbg_refrm.nodes()) & G_ref_nodes_set)
    out_d = self.dbg_refrm.out_degree()
    in_d = self.dbg_refrm.in_degree()
    shared_nodes_start = [x for x in shared_nodes if out_d[x] > 0]
    shared_nodes_end = [x for x in shared_nodes if in_d[x] > 0]

    # Add tips end & start in shared_nodes_end & start
    out_degree_g_testclean_dict = self.dbgclean.out_degree()
    in_degree_g_testclean_dict = self.dbgclean.in_degree()
    out_degree_g_ref_dict = G_ref.out_degree()
    in_degree_g_ref_dict = G_ref.in_degree()
    # First node of G_ref without predecessor / without successor
    start_g_ref = [key for key, degree in in_degree_g_ref_dict.items()
                   if degree == 0][0]  # only one in TP53
    end_g_ref = [key for key, degree in out_degree_g_ref_dict.items()
                 if degree == 0][0]  # only one in TP53
    end_tips_list = [
        key for key, degree in out_degree_g_testclean_dict.items()
        if degree == 0 and key not in G_ref and key in self.kmer_end_set
    ]
    start_tips_list = [
        key for key, degree in in_degree_g_testclean_dict.items()
        if degree == 0 and key not in G_ref and key in self.kmer_start_set
    ]
    shared_nodes_start.extend(start_tips_list)
    shared_nodes_end.extend(end_tips_list)

    for node_start in shared_nodes_start:
        for node_end in shared_nodes_end:
            for alternative_path in nx.all_simple_paths(self.dbg_refrm, node_start, node_end):
                if len(set(alternative_path) & G_ref_nodes_set) > 2:
                    continue

                # Reads shared by all nodes of the alternative path
                intersect_allnodes_pathAlt_G_sample = set.intersection(
                    *[set(self.dbg_refrm.node[node]['read_list_n'])
                      for node in alternative_path])
                if len(intersect_allnodes_pathAlt_G_sample) == 0:
                    continue

                # Reference path choice.
                # The reference extremities live in their own variables: the
                # loop variables used to be overwritten and were only restored
                # at the very end of the iteration, so the `continue` below
                # leaked the replaced nodes into the following graph searches.
                ref_start, ref_end = node_start, node_end
                # Replace start/end if it's a tip
                if node_start not in G_ref:
                    logger.critical(
                        "The node %s (read support : %d) is a tips(start)",
                        node_start,
                        len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
                    ref_start = start_g_ref
                if node_end not in G_ref:
                    logger.critical(
                        "The node %s (read support : %d) is a tips(end)",
                        node_end,
                        len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
                    ref_end = end_g_ref

                reference_path_list = list(nx.all_simple_paths(G_ref, ref_start, ref_end))

                # if there is no reference path, check predecessors/successors
                # of start/end nodes of the path (just +1 at this moment)
                if len(reference_path_list) == 0:
                    reference_path_list_successor = []
                    reference_path_list_predecessor = []
                    for successor in G_ref.successors(ref_end):
                        reference_path_list_successor.extend(
                            nx.all_simple_paths(G_ref, ref_start, successor))
                        if len(reference_path_list_successor) > 0:
                            logger.critical(
                                "Successor is add to the reference and alternative path between %s (ref list : %s) and %s (ref list : %s)",
                                ref_start, str(G_ref.node[ref_start]['ref_list']),
                                ref_end, G_ref.node[ref_end]['ref_list'])
                            alternative_path.append(successor)
                            ref_end = successor
                            reference_path_list = reference_path_list_successor
                            break
                    # The predecessor search runs even when a successor was
                    # found, and then supersedes the successor-based list.
                    for predecessor in G_ref.predecessors(ref_start):
                        reference_path_list_predecessor.extend(
                            nx.all_simple_paths(G_ref, predecessor, ref_end))
                        if len(reference_path_list_predecessor) > 0:
                            logger.critical(
                                "Predecessor is add to the reference and alternative path between %s (ref list : %s) and %s (ref list : %s)",
                                ref_start, str(G_ref.node[ref_start]['ref_list']),
                                ref_end, G_ref.node[ref_end]['ref_list'])
                            alternative_path.insert(0, predecessor)
                            ref_start = predecessor
                            reference_path_list = reference_path_list_predecessor
                            break
                    if (len(reference_path_list_predecessor) == 0
                            and len(reference_path_list_successor) == 0):
                        logger.critical(
                            "No reference path between %s (ref list : %s) and %s (ref list : %s)",
                            ref_start, str(G_ref.node[ref_start]['ref_list']),
                            ref_end, G_ref.node[ref_end]['ref_list'])
                        logger.critical("Alternative path : %s", alternative_path)
                        continue

                # Seed the choice with the first candidate so the tie branch
                # below never reads an unbound name or a path left over from a
                # previous alteration when the first score does not exceed 0.
                reference_path = reference_path_list[0]
                if len(reference_path_list) > 1:
                    # Keep the reference path whose sequence aligns best with
                    # the alternative sequence.
                    alignment_score = 0
                    alternative_sequence = ALT.kmerpathToSeq(alternative_path, k)
                    for candidate_path in reference_path_list:
                        reference_sequence = ALT.kmerpathToSeq(candidate_path, k)
                        alignment = sw.align(alternative_sequence, reference_sequence)
                        if alignment.score > alignment_score:
                            alignment_score = alignment.score
                            reference_path = candidate_path
                        elif alignment.score == alignment_score:
                            # On a tie, sum the number of 'ref_list' keys over
                            # the nodes of each path (duplicates included) and
                            # keep the path with the smaller total.
                            old_ref_list_count = sum(
                                len(G_ref.node[node2check]['ref_list'].keys())
                                for node2check in reference_path)
                            new_ref_list_count = sum(
                                len(G_ref.node[node2check]['ref_list'].keys())
                                for node2check in candidate_path)
                            if old_ref_list_count > new_ref_list_count:
                                reference_path = candidate_path
                            elif old_ref_list_count == new_ref_list_count:
                                logger.critical("Same et size of reference paths")

                # Reads shared by all nodes of the reference path in self.dbg
                read_set_pathRef_G_sample = []
                for node in reference_path:
                    if node not in self.dbg:
                        # Reference path not represented in the sample graph:
                        # the one-character string makes the count passed to
                        # ALT equal to 1 rather than 0.
                        intersect_allnodes_pathRef_G_sample = "0"
                        break
                    read_set_pathRef_G_sample.append(set(self.dbg.node[node]['read_list_n']))
                else:
                    # Reached only when every reference node is in self.dbg.
                    intersect_allnodes_pathRef_G_sample = set.intersection(
                        *read_set_pathRef_G_sample)

                self.alteration_list.append(
                    ALT(reference_path, alternative_path,
                        len(intersect_allnodes_pathRef_G_sample),
                        len(intersect_allnodes_pathAlt_G_sample),
                        k,
                        max(self.total_coverage_node(ref_start),
                            self.total_coverage_node(ref_end)) * alpha / 100))
def alteration_list_init(self, G_ref, kmer_length, min_support, max_len):
    """Rebuild ``self.alteration_list`` from the paths of ``self.dbg_refrm``.

    Every simple path of ``self.dbg_refrm`` between a start candidate and an
    end candidate is considered as an alternative path. For each one that
    passes the filters below, a reference path is selected in ``G_ref`` and
    one ``ALT`` is appended per atomic alteration yielded by
    ``decompose_multiple_alterations``.

    A path is skipped when it shares more than two nodes with ``G_ref``, when
    the reads common to all its nodes do not exceed
    ``total_coverage * min_support / 100``, when no reference path exists
    between its (possibly anchored) extremities, or when the reference and
    alternative paths differ by more than ``max_len`` nodes.

    :param G_ref: reference graph.
    :param kmer_length: k-mer length, forwarded to ``ALT.kmerpathToSeq``,
        ``decompose_multiple_alterations`` and ``ALT``.
    :param min_support: percentage applied to coverage values to derive the
        read-support thresholds.
    :param max_len: largest accepted difference between the number of nodes
        of the reference path and of the alternative path.
    """
    self.alteration_list = []

    # Only nodes in dbg_refrm & G_ref and with in degree > 0 for end nodes
    # and out degree > 0 for start nodes
    G_ref_nodes_set = set(G_ref.nodes())
    shared_nodes = list(set(self.dbg_refrm.nodes()) & G_ref_nodes_set)
    out_d = self.dbg_refrm.out_degree()
    in_d = self.dbg_refrm.in_degree()
    shared_nodes_start = [x for x in shared_nodes if out_d[x] > 0]
    shared_nodes_end = [x for x in shared_nodes if in_d[x] > 0]

    # Add tips: nodes of dbgclean without successor (end) or predecessor
    # (start), absent from G_ref and listed in kmer_end_set / kmer_start_set.
    out_degree_g_testclean_dict = self.dbgclean.out_degree()
    in_degree_g_testclean_dict = self.dbgclean.in_degree()
    end_tips_list = [
        key for key, degree in out_degree_g_testclean_dict.items()
        if degree == 0 and key not in G_ref and key in self.kmer_end_set
    ]
    start_tips_list = [
        key for key, degree in in_degree_g_testclean_dict.items()
        if degree == 0 and key not in G_ref and key in self.kmer_start_set
    ]
    shared_nodes_start.extend(start_tips_list)
    shared_nodes_end.extend(end_tips_list)

    # Search for alternative paths
    for node_start in shared_nodes_start:
        for node_end in shared_nodes_end:
            for alternative_path in nx.all_simple_paths(self.dbg_refrm, node_start, node_end):
                alternative_nodes = set(alternative_path)
                if len(alternative_nodes & G_ref_nodes_set) > 2:
                    continue

                # Compute coverage of the alternative path
                total_coverage = max(
                    [self.total_coverage_node(alt_node) for alt_node in alternative_path])

                # Reads shared by all nodes of the alternative path
                intersect_allnodes_pathAlt_G_sample = set.intersection(
                    *[set(self.dbg_refrm.node[node]['read_list_n'])
                      for node in alternative_path])
                if len(intersect_allnodes_pathAlt_G_sample) <= total_coverage * min_support / 100:
                    continue

                # Reference path choice.
                # The reference extremities live in their own variables: the
                # loop variables used to be overwritten with the anchors and
                # were only restored at the very end of the iteration, so any
                # `continue` below leaked a stale anchor into the following
                # alternative paths and graph searches.
                ref_start, ref_end = node_start, node_end
                if node_start not in G_ref:
                    logger.critical(
                        "The node %s (read support : %d) is a tip (start)",
                        node_start,
                        len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
                    anchor = identify_anchor_kmer_in_reference_graph(
                        G_ref, node_start, rightmost=node_end,
                        path_length=len(alternative_path))
                    logger.critical("Node %s anchored to %s", node_start, anchor)
                    ref_start = anchor
                if node_end not in G_ref:
                    logger.critical(
                        "The node %s (read support : %d) is a tip (end)",
                        node_end,
                        len(self.dbg_refrm.node[alternative_path[1]]['read_list_n']))
                    # NOTE(review): the k-mer handed to the anchoring helper is
                    # the (possibly anchored) start node, not the end tip, and
                    # the logged read support is the one of the second node of
                    # the path. Kept as found; confirm both are intended.
                    anchor = identify_anchor_kmer_in_reference_graph(
                        G_ref, ref_start, leftmost=ref_start,
                        path_length=len(alternative_path))
                    logger.critical("Node %s anchored to %s", node_end, anchor)
                    ref_end = anchor

                reference_path_list = list(nx.all_simple_paths(G_ref, ref_start, ref_end))
                if len(reference_path_list) == 0:
                    logger.critical("No reference path between %s and %s", ref_start, ref_end)
                    logger.critical("Alternative path : %s", alternative_path)
                    continue

                # With several reference paths, keep the one sharing the most
                # nodes with the alternative path; on a tie, keep the one whose
                # length is closest to the alternative path's.
                reference_path = reference_path_list[0]
                if len(reference_path_list) > 1:
                    logger.debug("Trying to identify actual reference")
                    size_alternative_path = len(alternative_path)
                    size_biggest_intersection = len(alternative_nodes & set(reference_path))
                    logger.debug("Selected ref path num 0 with size %d", size_biggest_intersection)
                    for i_reference_path in range(1, len(reference_path_list)):
                        curr_reference_path = reference_path_list[i_reference_path]
                        size_intersection = len(alternative_nodes & set(curr_reference_path))
                        if size_intersection > size_biggest_intersection:
                            size_biggest_intersection = size_intersection
                            reference_path = curr_reference_path
                            logger.debug("Switching to ref path num %d with size %d",
                                         i_reference_path, size_biggest_intersection)
                        elif size_intersection == size_biggest_intersection:
                            delta_1 = abs(len(reference_path) - size_alternative_path)
                            delta_2 = abs(len(curr_reference_path) - size_alternative_path)
                            if delta_2 < delta_1:
                                reference_path = curr_reference_path
                                logger.debug(
                                    "Switching to ref path num %d with size %d and deltas: %d--%d ",
                                    i_reference_path, size_biggest_intersection,
                                    delta_2, delta_1)
                    # Sanity checks kept from the original implementation; the
                    # second one fails when the selected reference path shares
                    # no node with the alternative path.
                    assert reference_path
                    assert size_biggest_intersection

                # Reads shared by all nodes of the reference path in self.dbg
                read_set_pathRef_G_sample = []
                for node in reference_path:
                    if node not in self.dbg:
                        logger.critical("Identified node %s absent from the input DBG", node)
                        # Weird smoothing, TODO check with justine if required:
                        # the one-character string makes the count passed to
                        # ALT equal to 1 rather than 0.
                        intersect_allnodes_pathRef_G_sample = "0"
                        break
                    read_set_pathRef_G_sample.append(set(self.dbg.node[node]['read_list_n']))
                else:
                    # Reached only when every reference node is in self.dbg.
                    intersect_allnodes_pathRef_G_sample = set.intersection(
                        *read_set_pathRef_G_sample)

                if abs(len(reference_path) - len(alternative_path)) > max_len:
                    logger.critical("Disregarding large alteration %s vs %s",
                                    reference_path, alternative_path)
                    continue

                reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
                # Decompose path if it is multiple
                for atomic_sequence, atomic_path in decompose_multiple_alterations(
                        reference_path, alternative_path, kmer_length):
                    self.alteration_list.append(
                        ALT(reference_path, atomic_path,
                            reference_sequence, atomic_sequence,
                            len(intersect_allnodes_pathRef_G_sample),
                            len(intersect_allnodes_pathAlt_G_sample),
                            kmer_length,
                            max(self.total_coverage_node(ref_start),
                                self.total_coverage_node(ref_end)) * min_support / 100))