def reconstruction(kmers):
    """Reconstruct a string from its k-mers by walking an Eulerian path.

    The k-mers are converted to a De Bruijn adjacency mapping with
    ``debruijn2``.  Every edge of that graph is then used exactly once
    (Hierholzer's algorithm), so nodes with several outgoing edges are
    handled instead of having all but their first edge discarded.

    Args:
        kmers: collection of k-mers, passed unchanged to ``debruijn2``.

    Returns:
        The start node followed by the last character of every node visited
        after it, or an empty string when the graph has no edges.

    Raises:
        ValueError: if some edges cannot be reached from the chosen start
            node, i.e. no single walk covers the whole graph.
    """
    # NOTE(review): assumes debruijn2 returns {node: [successor, ...]} with
    # string nodes -- confirm against its definition.
    adj_dict = debruijn2(kmers)

    # Unused edges per node.  Each list is stored reversed so that .pop()
    # hands out successors in their original order in O(1).
    remaining = {
        source: list(reversed(targets))
        for source, targets in adj_dict.items()
    }

    # balance[node] = out-degree - in-degree.  The path must start at the node
    # with one surplus outgoing edge (+1); its end node carries -1.
    balance = Counter()
    for source, targets in adj_dict.items():
        for target in targets:
            balance[target] -= 1
            balance[source] += 1

    if not balance:
        return ""

    # In a fully balanced graph (a cycle) every count is 0 and the first node
    # that was recorded is used as the start.
    start = balance.most_common(1)[0][0]

    # Hierholzer: walk forward while the current node still has an unused
    # edge; once stuck, move the node to the finished trail.  Detours found
    # while backtracking are thereby spliced into the right place.
    stack = [start]
    trail = []
    while stack:
        unused = remaining.get(stack[-1])
        if unused:
            stack.append(unused.pop())
        else:
            trail.append(stack.pop())

    if any(remaining.values()):
        raise ValueError("k-mers do not form a single Eulerian path")

    # The trail was collected back to front.
    trail.reverse()
    return trail[0] + "".join(node[-1] for node in trail[1:])
def reconstruction(kmers):
    """Reconstruct a string from its k-mers by walking an Eulerian path.

    The k-mers are converted to a De Bruijn adjacency mapping with
    ``debruijn2``.  Every edge of that graph is then used exactly once
    (Hierholzer's algorithm), so nodes with several outgoing edges are
    handled instead of having all but their first edge discarded.

    Args:
        kmers: collection of k-mers, passed unchanged to ``debruijn2``.

    Returns:
        The start node followed by the last character of every node visited
        after it, or an empty string when the graph has no edges.

    Raises:
        ValueError: if some edges cannot be reached from the chosen start
            node, i.e. no single walk covers the whole graph.
    """
    # NOTE(review): assumes debruijn2 returns {node: [successor, ...]} with
    # string nodes -- confirm against its definition.
    adj_dict = debruijn2(kmers)

    # Unused edges per node.  Each list is stored reversed so that .pop()
    # hands out successors in their original order in O(1).
    remaining = {
        source: list(reversed(targets))
        for source, targets in adj_dict.items()
    }

    # balance[node] = out-degree - in-degree.  The path must start at the node
    # with one surplus outgoing edge (+1); its end node carries -1.
    balance = Counter()
    for source, targets in adj_dict.items():
        for target in targets:
            balance[target] -= 1
            balance[source] += 1

    if not balance:
        return ""

    # In a fully balanced graph (a cycle) every count is 0 and the first node
    # that was recorded is used as the start.
    start = balance.most_common(1)[0][0]

    # Hierholzer: walk forward while the current node still has an unused
    # edge; once stuck, move the node to the finished trail.  Detours found
    # while backtracking are thereby spliced into the right place.
    stack = [start]
    trail = []
    while stack:
        unused = remaining.get(stack[-1])
        if unused:
            stack.append(unused.pop())
        else:
            trail.append(stack.pop())

    if any(remaining.values()):
        raise ValueError("k-mers do not form a single Eulerian path")

    # The trail was collected back to front.
    trail.reverse()
    return trail[0] + "".join(node[-1] for node in trail[1:])
def contig(kmers):
    """Return the maximal non-branching paths of the k-mers' De Bruijn graph.

    A path starts at every node that is not 1-in-1-out and follows each of
    its outgoing edges through 1-in-1-out nodes until the next branching
    node.  Isolated cycles made up solely of 1-in-1-out nodes are reported
    once each and are listed after the ordinary paths.

    Args:
        kmers: collection of k-mers, passed unchanged to ``debruijn2``.

    Returns:
        A list of strings.  Each string is a path's first node followed by
        the last character of every subsequent node on that path.
    """
    # NOTE(review): assumes debruijn2 returns {node: [successor, ...]} with
    # string nodes -- confirm against its definition.
    adj_dict = debruijn2(kmers)

    # Count OUTGOING and INCOMING edges for every node.
    v_out = Counter()
    v_in = Counter()
    for source, targets in adj_dict.items():
        for target in targets:
            v_in[target] += 1
            v_out[source] += 1

    def one_in_one_out(node):
        # Counter lookups of unknown nodes yield 0 without inserting a key,
        # so this is safe to call while iterating over v_out.
        return v_out[node] == 1 and v_in[node] == 1

    paths = []
    cycles = []
    # Nodes (not characters) already covered by a recorded isolated cycle, so
    # that each cycle is reported once rather than once per rotation.
    cycle_nodes = set()

    # Only nodes with at least one outgoing edge appear in v_out.
    for node in v_out:
        if not one_in_one_out(node):
            # A branching node: every outgoing edge starts its own path.
            for successor in adj_dict[node]:
                path = node + successor[-1]
                current = successor
                # Extend through 1-in-1-out nodes until the next branch.
                while one_in_one_out(current):
                    current = adj_dict[current][0]
                    path += current[-1]
                paths.append(path)

        elif node not in cycle_nodes:
            # A 1-in-1-out node may lie on an isolated cycle.  Follow the
            # unique outgoing edges until we return to the start (a cycle) or
            # reach a branching node (not isolated, so nothing is recorded).
            path = node
            visited = [node]
            current = node
            while True:
                current = adj_dict[current][0]
                path += current[-1]
                # Test for closure before extending any further so that a
                # self-loop yields exactly one edge.
                if current == node:
                    cycles.append(path)
                    cycle_nodes.update(visited)
                    break
                if not one_in_one_out(current):
                    break
                visited.append(current)

    paths.extend(cycles)
    return paths
# Example #4
# 0
def universal(k):
    """Build a circular string by walking an Eulerian cycle of a De Bruijn graph.

    The k-mers come from ``condensed_list_b(k)`` and the graph from
    ``debruijn2``.  Starting at a random node, the walk takes unused edges
    (choosing randomly where several remain) until it is stuck; it then
    rotates the cycle found so far to a node that still has unused edges and
    continues from there until every edge has been used.

    Args:
        k: k-mer length; graph nodes are taken to be (k-1)-mers.

    Returns:
        The cycle as a linear string with the trailing k-1 characters (a
        repeat of the leading k-1 characters) removed.

    Raises:
        ValueError: if unused edges remain but none of them leaves a node on
            the cycle built so far.
    """
    # NOTE(review): presumably condensed_list_b(k) yields every k-mer over the
    # alphabet and debruijn2 returns {node: [successor, ...]} -- confirm.
    kmers = condensed_list_b(k)
    adj_dict = debruijn2(kmers)

    # Unused edges per node.  A key is present only while that node still has
    # at least one unused outgoing edge.
    remain_d = {
        source: list(targets)
        for source, targets in adj_dict.items()
        if targets
    }

    # Random starting node.  list(...) keeps this working on Python 3, where
    # dict views cannot be indexed.
    node = random.choice(list(adj_dict))
    u_string = node

    while remain_d:
        unused = remain_d.get(node)

        if unused is None:
            # Stuck with edges left elsewhere.  Because the graph is balanced
            # we are back at the start of the cycle, so rotate the cycle to
            # begin (and end) at a node that still has an unused edge.
            for i in range(len(u_string)):
                candidate = u_string[i:i + k - 1]
                if candidate in remain_d:
                    node = candidate
                    # Drop the first i characters and re-append the stretch
                    # that followed the old start node, so the string closes
                    # on `candidate`.
                    u_string = u_string[i:] + u_string[k - 1:i + k - 1]
                    break
            else:
                # Without this the loop would spin forever.
                raise ValueError(
                    "unused edges are unreachable from the current cycle"
                )
            continue

        # Take the only remaining edge, or a random one if several are left.
        if len(unused) == 1:
            successor = remain_d.pop(node)[0]
        else:
            successor = unused.pop(random.randrange(len(unused)))
        node = successor
        u_string += node[-1]

    # The string is circular: its last k-1 characters repeat the first k-1
    # and can be removed.
    return u_string[:len(u_string) - (k - 1)]