def reconstruction(kmers):
    """Reconstruct a string from its k-mers by walking the De Bruijn graph.

    The k-mers are converted to an adjacency dictionary by ``debruijn2`` and
    the edges of that graph are then consumed one at a time with
    Hierholzer's algorithm.  Unlike a plain "follow the only edge" walk,
    this copes with nodes that have several outgoing edges (repeated
    (k-1)-mers); provided the graph has an Eulerian path, every edge is
    used exactly once.

    NOTE(review): ``debruijn2`` is defined outside this block; it is assumed
    to return a dict mapping each node (a string) to a list of the nodes it
    points to -- confirm against its definition.
    NOTE(review): this file defines ``reconstruction`` more than once; the
    last definition is the one Python keeps.

    Args:
        kmers: collection of k-mers, passed unchanged to ``debruijn2``.

    Returns:
        The start node followed by the last character of every subsequent
        node on the path, or an empty string when the graph has no edges.
    """
    adj_dict = debruijn2(kmers)

    # Private copy of the edges not yet taken.  Each list is stored reversed
    # so the next edge can be removed with an O(1) pop() from the end while
    # edges are still consumed in their original order.
    remain_d = {}
    for source, targets in adj_dict.items():
        remain_d[source] = list(targets)[::-1]

    # Balance of every node: +1 per outgoing edge, -1 per incoming edge.
    find_ends = Counter()
    for dir_out, dir_ins in adj_dict.items():
        for ins in dir_ins:
            find_ends[ins] -= 1
            find_ends[dir_out] += 1
    if not find_ends:
        # No edges at all: nothing to reconstruct.
        return ""

    # The start of the path is the node with the largest surplus of outgoing
    # edges.  In a fully balanced graph (a cycle) all balances are zero and
    # the first node recorded is used.
    start = find_ends.most_common(1)[0][0]

    # Hierholzer's algorithm: keep extending the current trail; when the
    # node on top of the stack has no untaken edge left, it is final and is
    # emitted.  Nodes are emitted in reverse path order.
    stack = [start]
    path = []
    while stack:
        untaken = remain_d.get(stack[-1])
        if untaken:
            stack.append(untaken.pop())
        else:
            path.append(stack.pop())
    path.reverse()

    # Consecutive nodes overlap in all but their last character.
    return path[0] + "".join(step[-1] for step in path[1:])
def reconstruction(kmers):
    """Reconstruct a string from its k-mers by walking the De Bruijn graph.

    The k-mers are converted to an adjacency dictionary by ``debruijn2`` and
    the edges of that graph are then consumed one at a time with
    Hierholzer's algorithm.  Unlike a plain "follow the only edge" walk,
    this copes with nodes that have several outgoing edges (repeated
    (k-1)-mers); provided the graph has an Eulerian path, every edge is
    used exactly once.

    NOTE(review): ``debruijn2`` is defined outside this block; it is assumed
    to return a dict mapping each node (a string) to a list of the nodes it
    points to -- confirm against its definition.
    NOTE(review): this file defines ``reconstruction`` more than once; the
    last definition is the one Python keeps.

    Args:
        kmers: collection of k-mers, passed unchanged to ``debruijn2``.

    Returns:
        The start node followed by the last character of every subsequent
        node on the path, or an empty string when the graph has no edges.
    """
    adj_dict = debruijn2(kmers)

    # Private copy of the edges not yet taken.  Each list is stored reversed
    # so the next edge can be removed with an O(1) pop() from the end while
    # edges are still consumed in their original order.
    remain_d = {}
    for source, targets in adj_dict.items():
        remain_d[source] = list(targets)[::-1]

    # Balance of every node: +1 per outgoing edge, -1 per incoming edge.
    find_ends = Counter()
    for dir_out, dir_ins in adj_dict.items():
        for ins in dir_ins:
            find_ends[ins] -= 1
            find_ends[dir_out] += 1
    if not find_ends:
        # No edges at all: nothing to reconstruct.
        return ""

    # The start of the path is the node with the largest surplus of outgoing
    # edges.  In a fully balanced graph (a cycle) all balances are zero and
    # the first node recorded is used.
    start = find_ends.most_common(1)[0][0]

    # Hierholzer's algorithm: keep extending the current trail; when the
    # node on top of the stack has no untaken edge left, it is final and is
    # emitted.  Nodes are emitted in reverse path order.
    stack = [start]
    path = []
    while stack:
        untaken = remain_d.get(stack[-1])
        if untaken:
            stack.append(untaken.pop())
        else:
            path.append(stack.pop())
    path.reverse()

    # Consecutive nodes overlap in all but their last character.
    return path[0] + "".join(step[-1] for step in path[1:])
def contig(kmers):
    """Return the maximal non-branching paths (contigs) of the De Bruijn graph.

    A path starts at every node that is not 1-in-1-out and is extended
    through 1-in-1-out nodes until the next branching node.  Isolated
    cycles, made only of 1-in-1-out nodes, are reported once each and are
    appended after the ordinary paths.  A cycle that contains a branching
    node is not reported as a cycle; it is covered by the paths that start
    at that branching node.

    NOTE(review): ``debruijn2`` is defined outside this block; it is assumed
    to return a dict mapping each node (a string) to a list of the nodes it
    points to -- confirm against its definition.

    Args:
        kmers: collection of k-mers, passed unchanged to ``debruijn2``.

    Returns:
        A list of strings, each spelling one path: its first node followed
        by the last character of every subsequent node.
    """
    adj_dict = debruijn2(kmers)

    # Count OUTGOING and INCOMING edges for every node.
    v_out = Counter()
    v_in = Counter()
    for source, targets in adj_dict.items():
        for target in targets:
            v_in[target] += 1
            v_out[source] += 1

    def one_in_one_out(candidate):
        # Counter lookups return 0 for unknown nodes without inserting them,
        # so this is safe to call while iterating over v_out.
        return v_out[candidate] == 1 and v_in[candidate] == 1

    paths = []
    cycles = []
    # Nodes (not characters) that already belong to a reported cycle, so
    # each isolated cycle is emitted once rather than once per rotation.
    cycle_nodes = set()

    for node in v_out:
        if not one_in_one_out(node):
            # A branching node: every outgoing edge starts its own path.
            for first in adj_dict[node]:
                current = first
                pieces = [node, current[-1]]
                while one_in_one_out(current):
                    current = adj_dict[current][0]
                    pieces.append(current[-1])
                paths.append("".join(pieces))
        elif node not in cycle_nodes:
            # A 1-in-1-out node may lie on an isolated cycle: follow the
            # single outgoing edges until we return to the start (a cycle)
            # or hit a branching node (not a cycle).  Closure is tested
            # before extending so a self-loop is spelled only once.
            members = [node]
            current = adj_dict[node][0]
            while current != node and one_in_one_out(current):
                members.append(current)
                current = adj_dict[current][0]
            if current == node:
                cycle_nodes.update(members)
                tail = "".join(member[-1] for member in members[1:])
                cycles.append(node + tail + node[-1])

    paths.extend(cycles)
    return paths
def universal(k):
    """Build a k-universal circular string from an Eulerian cycle.

    The k-mers returned by ``condensed_list_b(k)`` are turned into a
    De Bruijn adjacency dictionary by ``debruijn2``.  Starting from a random
    node, unused edges are followed (chosen at random where there are
    several) until the walk closes; while unused edges remain, the cycle is
    rotated to start at a node that still has one and the walk continues.

    NOTE(review): ``condensed_list_b`` and ``debruijn2`` are defined outside
    this block.  ``condensed_list_b(k)`` presumably lists every possible
    k-mer, and ``debruijn2`` is assumed to return a dict mapping each node
    (a (k-1)-character string) to a list of successor nodes -- confirm.
    NOTE(review): for ``k == 1`` the final slice ``[:-0]`` is empty, so
    ``k >= 2`` is assumed.

    Args:
        k: length of the k-mers the circular string must contain.

    Returns:
        The circular string with the trailing ``k - 1`` characters (which
        repeat its beginning) removed.

    Raises:
        ValueError: if edges remain unused but none of them leaves a node on
            the current cycle, so the cycle cannot be extended any further
            (previously this case looped forever).
    """
    # Generate list of possible k-mers.
    kmers = condensed_list_b(k)
    # Use De Bruijn approach to create adjacency dictionary from the k-mers.
    adj_dict = debruijn2(kmers)
    # Edges not yet taken, initialised as a copy of the adjacency dict.
    remain_d = deepcopy(adj_dict)

    # Randomly select a starting node.  list() is needed because dict views
    # cannot be indexed on Python 3.
    node = random.choice(list(adj_dict))
    u_string = node

    # Continue as long as there are edges that remain untaken.
    while remain_d:
        value = remain_d.get(node)

        if value is None:
            # The node has no unused edges, so the walk is back at its start
            # (the graph is balanced), yet edges remain elsewhere.  Rotate
            # the current cycle so it starts at a node that still has an
            # unused edge and carry on from there.
            for i in range(len(u_string)):
                candidate = u_string[i:i + k - 1]
                if remain_d.get(candidate):
                    node = candidate
                    u_string = u_string[i:] + u_string[k - 1:i + k - 1]
                    break
            else:
                raise ValueError(
                    "unused edges are not reachable from the current cycle"
                )
        elif len(value) == 1:
            # Single unused edge: take it and drop the node from the dict.
            node = remain_d.pop(node)[0]
            u_string = u_string + node[-1]
        else:
            # Several unused edges: take one at random and leave the rest in
            # the node's adjacency list (popped in place).
            node = value.pop(random.randrange(len(value)))
            u_string = u_string + node[-1]

    # Since the string is a circle, the last k-1 characters repeat the first
    # k-1 characters and can be removed.
    return u_string[:-(k - 1)]