コード例 #1
0
def main():
    """Simulate a coalescent-with-recombination graph driven by command-line flags.

    Options are read from ``sys.argv`` (run with ``-h`` for the full list).
    Every run writes the sampled recombination break points to
    ``recombination_points.log``. ``-D <file>`` additionally writes the graph
    in DOT format, ``-o`` writes one Newick tree per nucleotide to
    ``trees.nwk`` and the leaf sequences to ``genomes.fasta``, and ``-c``
    (only effective together with ``-o``) plots the distances between the
    trees of neighbouring nucleotides.

    The graph/tree helpers called here (``combine_ancestralMaterial``,
    ``split_ancestry``, ``calculate_totAncestry``, ``DOT_coal``,
    ``DOT_recomb``, ``create_initialList``, ``remove_unancestral``,
    ``remove_recomb``, ``create_newick``, ``simulate_nuc``,
    ``compare_trees``) are expected to be defined elsewhere in the module.
    """

    num_leaves = 100
    mutation_rate = 1.0
    genome_length = 1000
    recomb_rate = 10.0
    dot_check = 0
    compare_check = 0
    tree_check = 0
    # None means "-n was not given"; the middle nucleotide is substituted
    # below when a DOT file is requested.
    nucleotide = None

    help_lines = (
        "Arguments to coal_recombAncestral.py as follows:",
        "-N\tNumber of leaves (default 100)",
        "-M\tMutation rate (default 1.0)",
        "-G\tGenome length (default 1000)",
        "-R\tRecombination rate (default 10.0)",
        "-D\tWrite DOT file <file name> (default no file written)",
        "-n\tNucleotide of interest for DOT file (default middle nucleotide)",
        "-c\tCompare marginal trees to find detected recombination points (default FALSE)",
        "-o\tCalculate marginal trees and simulate genomes (default FALSE)",
    )

    def flag_value(index):
        """Return the argument following sys.argv[index]; exit if it is missing."""
        if index + 1 >= len(sys.argv):
            sys.exit("Missing value for option " + sys.argv[index])
        return sys.argv[index + 1]

    def covers(intervals, position):
        """Return True if any inclusive [start, end] interval contains position."""
        return any(interval[0] <= position <= interval[1] for interval in intervals)

    for i in range(len(sys.argv)):
        arg = sys.argv[i]
        if arg == "-N":
            num_leaves = int(flag_value(i))
        elif arg == "-M":
            mutation_rate = float(flag_value(i))
        elif arg == "-G":
            genome_length = int(flag_value(i))
        elif arg == "-R":
            recomb_rate = float(flag_value(i))
        elif arg == "-D":
            dot_fileName = flag_value(i)
            dot_check = 1
        elif arg == "-n":
            nucleotide = int(flag_value(i))
        elif arg == "-c":
            compare_check = 1
        elif arg == "-o":
            tree_check = 1
        elif arg in ("-h", "--help"):
            for line in help_lines:
                print(line)
            sys.exit()

    print("Number of leaves: " + str(num_leaves))
    print("Mutation rate: " + str(mutation_rate))
    print("Genome length: " + str(genome_length))
    print("Recombination rate: " + str(recomb_rate))

    if dot_check == 1:
        print("Writing DOT file...")
        if nucleotide is not None:
            print("Nucleotide of interest: " + str(nucleotide))
        else:
            nucleotide = (genome_length - 1) // 2
            print("Nucleotide of interest: " + str(nucleotide) + "(default)")

        # Write DOT file header; all leaves are forced onto the same rank.
        dot_file = open(dot_fileName, "w")
        dot_file.write('digraph asde91 {fontsize=5;ranksep="0.02";ratio=fill;size="10,10";\n')
        dot_file.write('edge[arrowhead=none];\n')
        dot_file.write('{rank=same;')
        for i in range(num_leaves):
            dot_file.write('%d[shape=point] ' % i)
        dot_file.write('}\n')

        # Nodes and edges are accumulated here and written out after the
        # simulation has finished.
        graph_nodes = []
        graph_edges = []
        for i in range(num_leaves):
            graph_nodes.append(str(i) + '[shape=point,width=0.00,height=0.00]\n')

    # File to store recombination points
    recomb_file = open("recombination_points.log", "w")

    # Define Node class
    class Node:
        def __init__(self):
            self.age = 0.0
            self.children = []
            self.parent = []
            # List of inclusive [start, end] intervals carried by this node.
            self.ancestral_material = []
            # Weight used when choosing which node recombines.
            self.tot_ancestralMaterial = 0
            # 1 if the node carries the nucleotide of interest (DOT output).
            self.ancestral = 0
            self.label = ''
            self.sequence = ''

    # Define initial leaf nodes: each leaf carries the whole genome.
    node_list = []
    for i in range(num_leaves):
        leaf = Node()
        leaf.ancestral_material = [[0, genome_length - 1]]
        leaf.ancestral = 1
        leaf.label = str(i)
        leaf.number = i
        leaf.tot_ancestralMaterial = genome_length
        node_list.append(leaf)

    # Indices of nodes that have not yet coalesced or recombined.
    live_nodeList = list(range(num_leaves))

    remaining_nodes = num_leaves
    new_node = num_leaves
    current_time = 0.0

    #########################################
    # Simulate coalescent-recombination graph
    #########################################

    while remaining_nodes > 1:
        current_ancestralMaterial = 0
        for i in live_nodeList:
            current_ancestralMaterial = current_ancestralMaterial + node_list[i].tot_ancestralMaterial

        # Convert to float BEFORE dividing so the ratio is not truncated by
        # integer division.
        coal_weight = remaining_nodes * (remaining_nodes - 1)
        recomb_weight = recomb_rate * (float(current_ancestralMaterial) / genome_length)
        total_weight = coal_weight + recomb_weight

        # Simulate time to next event
        current_time = current_time - 2 * np.log(1 - np.random.rand(1)) / total_weight

        # Determine whether event is coalescent or recombination
        reac_rand = np.random.rand(1)
        if reac_rand[0] < (float(coal_weight) / total_weight):
            # Coalescent event occurs

            # Choose two children nodes to coalesce
            children = random.sample(live_nodeList, 2)
            # Set the parent of the two children
            node_list[children[0]].parent.append(new_node)
            node_list[children[1]].parent.append(new_node)

            # Make new parent node
            node_list.append(Node())
            parent = node_list[new_node]
            parent.children = children
            parent.age = current_time
            parent.number = new_node

            # The parent carries the combined ancestral material of both
            # children.
            new_ancestry = combine_ancestralMaterial(node_list[children[0]].ancestral_material, node_list[children[1]].ancestral_material)
            parent.ancestral_material = new_ancestry[:]

            # Must use the attribute name that the rate calculation above
            # reads, otherwise the node counts as carrying no material.
            parent.tot_ancestralMaterial = calculate_totAncestry(parent.ancestral_material)

            if dot_check == 1:
                # Check if new parent is ancestral for nucleotide of interest
                if covers(parent.ancestral_material, nucleotide):
                    parent.ancestral = 1

                [graph_nodes, graph_edges] = DOT_coal(parent.ancestral, new_node, children, node_list[children[0]].ancestral, node_list[children[1]].ancestral, graph_nodes, graph_edges)

            # Replace the two children by their parent in the live list.
            temp_liveNodes = [new_node]
            for i in live_nodeList:
                if (i != children[0]) and (i != children[1]):
                    temp_liveNodes.append(i)
            live_nodeList = temp_liveNodes[:]

            remaining_nodes = remaining_nodes - 1
            new_node = new_node + 1

        else:
            # Recombination event occurs

            # Choose a child weighted by amount of remaining ancestral material
            ancestral_array = np.zeros(len(live_nodeList))
            for i in range(len(live_nodeList)):
                ancestral_array[i] = float(node_list[live_nodeList[i]].tot_ancestralMaterial) / current_ancestralMaterial
            cumsum_ancestral = np.cumsum(ancestral_array)
            recomb_rand = np.random.rand(1)
            # Fall back to the last node in case rounding leaves the final
            # cumulative weight marginally below the random draw.
            child = live_nodeList[-1]
            for i in range(len(live_nodeList)):
                if recomb_rand[0] <= cumsum_ancestral[i]:
                    child = live_nodeList[i]
                    # Stop at the FIRST matching bin; continuing would always
                    # end on the last live node.
                    break

            # Choose which point of the ancestral material is recombinant
            child_material = node_list[child].ancestral_material
            break_point = random.sample(range(child_material[0][0], child_material[len(child_material) - 1][1]), 1)
            break_point = break_point[0]

            # Write accepted recombination point to file
            recomb_file.write("%d\n" % break_point)

            # Set the two parents
            node_list[child].parent = [new_node, new_node + 1]

            # Make new parent nodes
            node_list.append(Node())
            node_list.append(Node())
            first_parent = node_list[new_node]
            second_parent = node_list[new_node + 1]
            first_parent.children.append(child)
            second_parent.children.append(child)
            first_parent.age = current_time
            second_parent.age = current_time
            first_parent.number = new_node
            second_parent.number = new_node + 1

            # Split ancestral material into two parents
            [first_parent.ancestral_material, second_parent.ancestral_material] = split_ancestry(node_list[child].ancestral_material, break_point)

            first_parent.tot_ancestralMaterial = calculate_totAncestry(first_parent.ancestral_material)
            second_parent.tot_ancestralMaterial = calculate_totAncestry(second_parent.ancestral_material)

            if dot_check == 1:
                # Check if two parents are ancestral to nucleotide of interest
                if covers(first_parent.ancestral_material, nucleotide):
                    first_parent.ancestral = 1
                if covers(second_parent.ancestral_material, nucleotide):
                    second_parent.ancestral = 1

                # Call function to record recombination nodes and edges
                [graph_nodes, graph_edges] = DOT_recomb(new_node, new_node + 1, child, node_list[child].ancestral, first_parent.ancestral, second_parent.ancestral, graph_nodes, graph_edges)

            # An empty parent indicates an invalid break point; report it.
            if (len(first_parent.ancestral_material) == 0) or (len(second_parent.ancestral_material) == 0):
                print("Child: " + str(node_list[child].ancestral_material))
                print("Break point: " + str(break_point))
                print("Recombination parent 1: " + str(first_parent.ancestral_material))
                print("Recombination parent 2: " + str(second_parent.ancestral_material))

            # Replace the child by its two parents in the live list.
            temp_liveNodes = [new_node, new_node + 1]
            for i in live_nodeList:
                if (i != child):
                    temp_liveNodes.append(i)
            live_nodeList = temp_liveNodes[:]

            remaining_nodes = remaining_nodes + 1
            new_node = new_node + 2
        #END ELSE
    #END WHILE

    recomb_file.close()

    if dot_check == 1:
        for i in range(len(graph_nodes)):
            dot_file.write(str(graph_nodes[i]))
        for i in range(len(graph_edges)):
            dot_file.write(str(graph_edges[i]))
        dot_file.write('}')
        dot_file.close()

    # Check for any ancestral errors: every nucleotide must be present in the
    # ancestral material of the last node created.
    for i in range(genome_length):
        root_material = node_list[len(node_list) - 1].ancestral_material
        if not covers(root_material, i):
            print("Error, nucleotide: " + str(i))
            print("Root ancestry: " + str(root_material))
            sys.exit()

    #########################################
    # Create clonal trees for each nucleotide
    #########################################

    if tree_check == 1:
        clonal_trees = []
        # Create tree for each nucleotide
        with open('trees.nwk', 'w') as tree_file:
            print("Calculating marginal trees and simulating sequences...")
            for nuc in range(genome_length):
                # Work on a copy so pruning does not alter the full graph.
                temp_list = deepcopy(node_list)
                # Create a list of all clonal nodes
                clonal_list = create_initialList(temp_list, nuc)

                # Remove un-ancestral parents and children
                clonal_list = remove_unancestral(temp_list, clonal_list)

                # Remove recombinant nodes
                [temp_list, clonal_list] = remove_recomb(temp_list, clonal_list)

                # Remove last node if recombination causes elongation of the tree
                if len(temp_list[clonal_list[-1]].children) == 1:
                    clonal_list = clonal_list[0:-1]

                # Set last node as root
                temp_list[clonal_list[-1]].parent = []

                # Calculate Newick tree for nucleotide and write to file
                newick_tree = create_newick(temp_list, clonal_list, num_leaves)
                newick_tree = newick_tree + ";"

                tree_file.write("[%d]%s\n" % (nuc, newick_tree))
                clonal_trees.append(newick_tree)

                # Simulate nucleotide down clonal frame
                node_list = simulate_nuc(nuc, node_list, temp_list, clonal_list, mutation_rate, num_leaves)

        with open("genomes.fasta", "w") as sequence_file:
            for i in range(num_leaves):
                sequence_file.write(">%d\n" % i)
                sequence_file.write("%s\n" % node_list[i].sequence)

        if compare_check == 1:
            print("Comparing marginal trees...")
            # Only neighbouring nucleotides are compared; the diagonal is 0
            # and every other cell stays NaN.
            tree_comparisons = np.empty((genome_length, genome_length))
            tree_comparisons.fill(float('nan'))
            for i in range(genome_length):
                for j in (i - 1, i, i + 1):
                    if j < 0 or j >= genome_length:
                        continue
                    if j == i:
                        tree_comparisons[i][j] = 0.0
                    else:
                        tree_comparisons[i][j] = compare_trees(clonal_trees[i], clonal_trees[j])
            # Plot heat map of distance metric
            img = plt.pcolor(tree_comparisons)
            plt.colorbar(img)
            # pyplot.show accepts only the keyword argument `block`.
            plt.show()
コード例 #2
0
def main():
    """Simulate a coalescent graph with interval recombination from command-line flags.

    Options are read from ``sys.argv`` (run with ``-h`` for the full list).
    Every run writes the start and end of each recombination break to
    ``recombination_points.log`` and prints the number of recombination
    events. ``-D <file>`` additionally writes the graph in DOT format,
    ``-o`` writes one Newick tree per nucleotide to ``trees.nwk``, ``-g``
    (with ``-o``) simulates sequences and writes ``genomes.fasta``, and
    ``-c`` (with ``-o``) plots all pairwise tree distances.

    The helpers called here (``clonal_recomb``, ``non_clonalRecomb``,
    ``combine_ancestralMaterial``, ``coal_ancestryStarts``,
    ``recomb_interval``, ``split_ancestry``, ``recomb_intervalStarts``,
    ``DOT_coal``, ``DOT_recomb``, ``create_initialList``,
    ``remove_unancestral``, ``remove_recomb``, ``create_newick``,
    ``simulate_nuc``, ``compare_trees``) are expected to be defined elsewhere
    in the module.
    """

    # For a reproducible run, seed both generators here, e.g.
    # random.seed(0) and np.random.seed(0).

    # Set default values for each user-defined variable
    num_leaves = 100
    mutation_rate = 0.1
    genome_length = 1000
    recomb_rate = 10.0
    dot_check = 0
    compare_check = 0
    tree_check = 0
    break_rate = 0.02
    genome_check = 0
    # None means "-n was not given"; the middle nucleotide is substituted
    # below when a DOT file is requested.
    nucleotide = None

    help_lines = (
        "Arguments to coal_recombAncestral.py as follows:",
        "-N\tNumber of leaves (default 100)",
        "-M\tMutation rate (default 0.1)",
        "-G\tGenome length (default 1000)",
        "-R\tRecombination rate (default 10.0)",
        "-D\tWrite DOT file <file name> (default no file written)",
        "-n\tNucleotide of interest for DOT file (default middle nucleotide)",
        "-c\tCompare marginal trees to find detected recombination points (default FALSE)",
        "-o\tCalculate marginal trees and simulate genomes (default FALSE)",
        "-b\tAverage recombinant break length (default 50)",
        "-g\tSimulate genomes down the tree (default FALSE)",
    )

    def flag_value(index):
        """Return the argument following sys.argv[index]; exit if it is missing."""
        if index + 1 >= len(sys.argv):
            sys.exit("Missing value for option " + sys.argv[index])
        return sys.argv[index + 1]

    def covers(intervals, position):
        """Return True if any inclusive [start, end] interval contains position."""
        return any(interval[0] <= position <= interval[1] for interval in intervals)

    def local_recombRate(node):
        """Return the node's clonal minus non-clonal recombination rate."""
        return clonal_recomb(node.ancestral_material, node.interval_starts, recomb_rate, genome_length, break_rate)\
            - non_clonalRecomb(node.ancestral_material, node.interval_starts, recomb_rate, genome_length, break_rate)

    # Search command-line input for relevant variables
    for i in range(len(sys.argv)):
        arg = sys.argv[i]
        if arg == "-N":
            num_leaves = int(flag_value(i))
        elif arg == "-M":
            mutation_rate = float(flag_value(i))
        elif arg == "-G":
            genome_length = int(flag_value(i))
        elif arg == "-R":
            recomb_rate = float(flag_value(i))
        elif arg == "-D":
            dot_fileName = flag_value(i)
            dot_check = 1
        elif arg == "-n":
            nucleotide = int(flag_value(i))
        elif arg == "-c":
            compare_check = 1
        elif arg == "-o":
            tree_check = 1
        elif arg == "-b":
            # The flag gives the mean break length; its reciprocal is stored.
            break_rate = 1 / (float(flag_value(i)))
        elif arg == "-g":
            genome_check = 1
        elif arg in ("-h", "--help"):
            for line in help_lines:
                print(line)
            sys.exit()

    print("Number of leaves: " + str(num_leaves))
    print("Mutation rate: " + str(mutation_rate))
    print("Genome length: " + str(genome_length))
    print("Recombination rate: " + str(recomb_rate))
    print("Average recombination break length: " + str(1 / break_rate))

    if dot_check == 1:
        print("Writing DOT file...")
        if nucleotide is not None:
            print("Nucleotide of interest: " + str(nucleotide))
        else:
            nucleotide = (genome_length - 1) // 2
            print("Nucleotide of interest: " + str(nucleotide) + "(default)")

        # Write DOT file header; all leaves are forced onto the same rank.
        dot_file = open(dot_fileName, "w")
        dot_file.write('digraph asde91 {fontsize=5;ranksep="0.02";ratio=fill;size="10,10";\n')
        dot_file.write('edge[arrowhead=none];\n')
        dot_file.write('{rank=same;')
        for i in range(num_leaves):
            dot_file.write('%d[shape=point] ' % i)
        dot_file.write('}\n')

        # Nodes and edges of the ARG are stored here and are written to the
        # DOT file later
        graph_nodes = []
        graph_edges = []
        for i in range(num_leaves):
            graph_nodes.append(str(i) + '[shape=point,width=0.00,height=0.00]\n')

    # File to store recombination breaks
    recomb_file = open("recombination_points.log", "w")

    # Define Node class
    class Node:
        def __init__(self):
            self.age = 0.0
            self.children = []
            self.parent = []
            # List of inclusive [start, end] intervals carried by this node.
            self.ancestral_material = []
            # 1 if the node carries the nucleotide of interest (DOT output).
            self.ancestral = 0
            self.label = ''
            self.sequence = ''
            self.interval_starts = []
            # Weight used when choosing which node recombines.
            self.recomb_rate = 0

    # Define initial leaf nodes: each leaf carries the whole genome.
    node_list = []
    for i in range(num_leaves):
        node_list.append(Node())
        node_list[i].ancestral_material = [[0, genome_length - 1]]
        node_list[i].ancestral = 1
        node_list[i].label = str(i)
        node_list[i].recomb_rate = local_recombRate(node_list[i])

    # List of nodes which have not yet undergone coalescence or recombination
    live_nodeList = list(range(num_leaves))

    new_node = num_leaves
    current_time = 0.0

    num_recomb = 0

    #########################################
    # Simulate coalescent-recombination graph
    #########################################

    while len(live_nodeList) > 1:
        current_recomb = 0
        for i in live_nodeList:
            current_recomb = current_recomb + node_list[i].recomb_rate

        num_live = len(live_nodeList)
        coal_weight = num_live * (num_live - 1)
        total_weight = coal_weight + (2 * current_recomb)

        # Simulate time to next event, exponential with rate: (k(k-1)/2 + rho)
        current_time = current_time - 2 * np.log(1 - np.random.rand(1)) / total_weight
        # Determine whether event is coalescent or recombination. Reaction is
        # coalescent with probability: (k(k-1))/(k(k-1) + 2*rho)
        reac_rand = np.random.rand(1)
        if reac_rand[0] < (float(coal_weight) / total_weight):
            # Coalescent event occurs
            # Choose two children nodes to coalesce
            children = random.sample(live_nodeList, 2)
            first_child = node_list[children[0]]
            second_child = node_list[children[1]]
            # Set the parent of the two children
            first_child.parent.append(new_node)
            second_child.parent.append(new_node)

            # Make new parent node
            node_list.append(Node())
            parent = node_list[new_node]
            parent.children = children
            parent.age = current_time

            # The parent carries the combined ancestral material of both
            # children.
            parent.ancestral_material = combine_ancestralMaterial(first_child.ancestral_material, second_child.ancestral_material)

            # If the new ancestry is the same as one of the children, the local
            # recombination rate does not need to be calculated. Note that the
            # interval_starts list is then shared with that child, not copied.
            if parent.ancestral_material == first_child.ancestral_material:
                parent.recomb_rate = first_child.recomb_rate
                parent.interval_starts = first_child.interval_starts
            elif parent.ancestral_material == second_child.ancestral_material:
                parent.recomb_rate = second_child.recomb_rate
                parent.interval_starts = second_child.interval_starts
            else:
                # If the ancestral material is a new set of intervals, calculate the new recombination rate
                parent.interval_starts = coal_ancestryStarts(parent.ancestral_material, genome_length)
                parent.recomb_rate = local_recombRate(parent)

            # Record the new node and edges for the DOT file
            if dot_check == 1:
                # Check if new parent is ancestral for nucleotide of interest
                if covers(parent.ancestral_material, nucleotide):
                    parent.ancestral = 1

                [graph_nodes, graph_edges] = DOT_coal(parent.ancestral, new_node, children, first_child.ancestral, second_child.ancestral, graph_nodes, graph_edges)

            # Remove the children from the live node list and add the new parent
            temp_liveNodes = [new_node]
            for i in live_nodeList:
                if (i != children[0]) and (i != children[1]):
                    temp_liveNodes.append(i)
            live_nodeList = temp_liveNodes[:]

            # Update the index which any new node will be given
            new_node = new_node + 1

        else:
            # Recombination event occurs
            num_recomb = num_recomb + 1

            # Choose a child weighted by local recombination rates
            local_recombRates = np.array([])
            for i in live_nodeList:
                local_recombRates = np.append(local_recombRates, node_list[i].recomb_rate)
            cum_recombRates = np.cumsum(local_recombRates)
            cum_recombRates = cum_recombRates / float(current_recomb)

            child_rand = np.random.rand(1)
            # Fall back to the last node so that `child` is never left unset
            # (or stale from an earlier event) if rounding keeps the final
            # cumulative weight below the random draw.
            child = live_nodeList[-1]
            for i in range(len(cum_recombRates)):
                if child_rand[0] < cum_recombRates[i]:
                    child = live_nodeList[i]
                    break

            # Choose break start and end points given the ancestral material of
            # the chosen node
            [break_start, break_end] = recomb_interval(node_list[child].ancestral_material, node_list[child].interval_starts, genome_length, break_rate)

            # Write recombination points to file
            recomb_file.write("%d\t%d\n" % (break_start, break_end))

            # Set the two parents
            node_list[child].parent = [new_node, new_node + 1]

            # Make new parent nodes
            node_list.append(Node())
            node_list.append(Node())
            first_parent = node_list[new_node]
            second_parent = node_list[new_node + 1]
            first_parent.children.append(child)
            second_parent.children.append(child)
            first_parent.age = current_time
            second_parent.age = current_time

            # Split ancestral material into two parents
            [first_parent.ancestral_material, second_parent.ancestral_material] = split_ancestry(node_list[child].ancestral_material, break_start, break_end, genome_length)

            # Count the number of recombination start sites in each new set of
            # ancestral intervals
            [first_parent.interval_starts, second_parent.interval_starts] = recomb_intervalStarts(first_parent.ancestral_material, second_parent.ancestral_material, genome_length)

            # Calculate recombination rate for each node (clonal recombination rate - non-clonal recombination rate)
            first_parent.recomb_rate = local_recombRate(first_parent)
            second_parent.recomb_rate = local_recombRate(second_parent)

            # Record new nodes and edges for the DOT file
            if dot_check == 1:
                # Check if two parents are ancestral to nucleotide of interest
                if covers(first_parent.ancestral_material, nucleotide):
                    first_parent.ancestral = 1
                if covers(second_parent.ancestral_material, nucleotide):
                    second_parent.ancestral = 1
                # Call function to record recombination nodes and edges
                [graph_nodes, graph_edges] = DOT_recomb(new_node, new_node + 1, child, node_list[child].ancestral, first_parent.ancestral, second_parent.ancestral, graph_nodes, graph_edges)

            # Check for empty ancestral material, implying incorrect recombination break
            if (len(first_parent.ancestral_material) == 0) or (len(second_parent.ancestral_material) == 0):
                print("Child: " + str(node_list[child].ancestral_material))
                print("Break start: " + str(break_start) + " Break end: " + str(break_end))
                print("Recombination parent 1: " + str(first_parent.ancestral_material))
                print("Recombination parent 2: " + str(second_parent.ancestral_material))

            # Update live node list
            temp_liveNodes = [new_node, new_node + 1]
            for i in live_nodeList:
                if (i != child):
                    temp_liveNodes.append(i)
            live_nodeList = temp_liveNodes[:]

            new_node = new_node + 2
        #END WHILE

    recomb_file.close()

    # Report the number of recombination events that occurred.
    print(num_recomb)

    # Write complete list of nodes and edges to the DOT file
    if dot_check == 1:
        for i in range(len(graph_nodes)):
            dot_file.write(str(graph_nodes[i]))
        for i in range(len(graph_edges)):
            dot_file.write(str(graph_edges[i]))
        dot_file.write('}')
        dot_file.close()

    # Check for any nucleotides not present in the ancestral material
    # of the root node (the last node created)
    for i in range(genome_length):
        root_material = node_list[len(node_list) - 1].ancestral_material
        if not covers(root_material, i):
            print("Error, nucleotide: " + str(i))
            print("Root ancestry: " + str(root_material))
            sys.exit()

    #########################################
    # Create clonal trees for each nucleotide
    #########################################

    if tree_check == 1:
        clonal_trees = []
        # Create tree for each nucleotide
        with open('trees.nwk', 'w') as tree_file:
            print("Calculating marginal trees and simulating sequences...")
            for nuc in range(genome_length):
                # Progress indicator
                print(nuc)
                # Work on a copy so pruning does not alter the full graph.
                temp_list = deepcopy(node_list)
                # Create a list of all clonal nodes
                clonal_list = create_initialList(temp_list, nuc)

                # Remove un-ancestral parents and children
                clonal_list = remove_unancestral(temp_list, clonal_list)

                # Remove recombinant nodes
                [temp_list, clonal_list] = remove_recomb(temp_list, clonal_list)

                # Remove last node if recombination causes elongation of the tree
                if len(temp_list[clonal_list[-1]].children) == 1:
                    clonal_list = clonal_list[0:-1]

                # Set last node as root
                temp_list[clonal_list[-1]].parent = []

                # Calculate Newick tree for nucleotide and write to file
                newick_tree = create_newick(temp_list, clonal_list, num_leaves)
                newick_tree = newick_tree + ";"

                tree_file.write("[%d]%s\n" % (nuc, newick_tree))
                clonal_trees.append(newick_tree)

                if genome_check == 1:
                    # Simulate nucleotide down clonal frame
                    node_list = simulate_nuc(nuc, node_list, temp_list, clonal_list, mutation_rate, num_leaves)

                # Release the per-nucleotide copy before the next deepcopy.
                del temp_list

        if genome_check == 1:
            # Write sequences of the leaves to file
            with open("genomes.fasta", "w") as sequence_file:
                for i in range(num_leaves):
                    sequence_file.write(">%d\n" % i)
                    sequence_file.write("%s\n" % node_list[i].sequence)

        # Perform pairwise comparison of trees at each nucleotide
        if compare_check == 1:
            print("Comparing marginal trees...")
            tree_comparisons = np.zeros((genome_length, genome_length))
            for i in range(genome_length):
                # Progress indicator
                print(i)
                for j in range(genome_length):
                    tree_comparisons[i][j] = compare_trees(clonal_trees[i], clonal_trees[j])
            # Plot heat map of distance metric
            img = plt.pcolor(tree_comparisons)
            plt.colorbar(img)
            # pyplot.show accepts only the keyword argument `block`.
            plt.show()
コード例 #3
0
def main():
    """Simulate a coalescent-with-recombination graph and write its outputs.

    Command-line flags (read directly from ``sys.argv``):
      -N  number of leaves            -M  mutation rate
      -G  genome length               -R  recombination rate
      -D  DOT output file name        -n  nucleotide of interest for the DOT file
      -c  compare marginal trees      -o  compute marginal trees / sequences
      -b  average recombinant break length (stored as break_rate = 1/value)
      -h / --help  print usage and exit

    Side effects visible in this function:
      * always writes "recombination_points.log" (accepted break start/end);
      * with -D, writes a DOT graph to the given file name;
      * with -o, writes "trees.nwk" and "genomes.fasta";
      * with -o and -c, fills a genome_length x genome_length matrix via
        compare_trees() and shows it with matplotlib.

    The function calls sys.exit() on --help and on two internal consistency
    failures (an empty recombination parent, or a nucleotide missing from the
    final node's ancestral material).
    """

    #random.seed(0)
    #np.random.seed(0)

    # Default parameter values; each may be overridden by a flag below.
    num_leaves = 100
    mutation_rate = 0.1
    genome_length = 1000
    recomb_rate = 10.0
    dot_check = 0
    compare_check = 0
    tree_check = 0
    break_rate = 0.02

    # Hand-rolled flag parsing: every argv entry is compared against the known
    # flags and the following entry is taken as the value.
    # NOTE(review): a value-taking flag given as the last argument makes
    # sys.argv[i+1] raise IndexError; values are not validated.
    for i in range(len(sys.argv)):
        if sys.argv[i] == "-N":
            num_leaves = int(sys.argv[i+1])
        elif sys.argv[i] == "-M":
            mutation_rate = float(sys.argv[i+1])
        elif sys.argv[i] == "-G":
            genome_length = int(sys.argv[i+1])
        elif sys.argv[i] == "-R":
            recomb_rate = float(sys.argv[i+1])
        elif sys.argv[i] == "-D":
            dot_fileName = (sys.argv[i+1])
            dot_check = 1
        elif sys.argv[i] == "-n":
            # 'nucleotide' is only bound when -n is given; see the locals()
            # check further down.
            nucleotide = int(sys.argv[i+1])
        elif sys.argv[i] == "-c":
            compare_check = 1
        elif sys.argv[i] == "-o":
            tree_check = 1
        elif sys.argv[i] == "-b":
            # The user supplies the mean break length; the reciprocal is kept.
            # NOTE(review): a value of 0 raises ZeroDivisionError here.
            break_rate = 1/(float(sys.argv[i+1]))
        elif (sys.argv[i] == "-h") or (sys.argv[i] == "--help"):
            print "Arguments to coal_recombAncestral.py as follows:"
            print "-N\tNumber of leaves (default 100)"
            print "-M\tMutation rate (default 0.1)"
            print "-G\tGenome length (default 1000)"
            print "-R\tRecombination rate (default 10.0)"
            print "-D\tWrite DOT file <file name> (default no file written)"
            print "-n\tNucleotide of interest for DOT file (default middle nucleotide)"
            print "-c\tCompare marginal trees to find detected recombination points (default FALSE)"
            print "-o\tCalculate marginal trees and simulate genomes (default FALSE)"
            print "-b\tAverage recombinant break length (default 50)"
            sys.exit()


    # Echo the effective parameters.
    print ("Number of leaves: " + str(num_leaves))
    print ("Mutation rate: " + str(mutation_rate))
    print ("Genome length: " + str(genome_length))
    print ("Recombination rate: " + str(recomb_rate))
    print ("Average recombination break length: " + str(1/break_rate))

    if dot_check == 1:
        print ("Writing DOT file...")
        # 'nucleotide' exists as a local only if -n was passed; otherwise fall
        # back to the middle position of the genome.
        if 'nucleotide' in locals():
            print ("Nucleotide of interest: " + str(nucleotide))
        else:
            nucleotide = int(math.floor((genome_length-1) / 2))
            print ("Nucleotide of interest: " + str(nucleotide) + "(default)")

        #Write DOT file header
        dot_file  = open(dot_fileName,"w")
        dot_file.write('digraph asde91 {fontsize=5;ranksep="0.02";ratio=fill;size="10,10";\n')
        dot_file.write('edge[arrowhead=none];\n')
        # All leaves are placed on the same rank.
        dot_file.write('{rank=same;')
        for i in range(num_leaves):
            dot_file.write('%d[shape=point] ' % i)
        dot_file.write('}\n')

    if dot_check == 1:
        # Node/edge statements are accumulated in memory and written to
        # dot_file after the simulation loop.
        graph_nodes = []
        graph_edges = []
        for i in range(num_leaves):
            graph_nodes.append(str(i) + '[shape=point,width=0.00,height=0.00]\n')

    #File to store recombination points
    recomb_file = open("recombination_points.log","w")

    #Define Node class
    class Node:
        """One vertex of the simulated graph (leaf, coalescent or recombinant parent)."""
        def __init__(self):
            # Time at which the node was created (0.0 for leaves).
            self.age = 0.0
            # Indices into node_list of this node's children / parents.
            self.children = []
            self.parent = []
            # List of [start, end] inclusive genome intervals carried by the node.
            self.ancestral_material = []
            # 1 if the node is ancestral for the DOT nucleotide of interest
            # (only updated for non-leaf nodes when dot_check == 1).
            self.ancestral = 0
            self.label = ''
            self.sequence = ''
            # A 'number' attribute (index in node_list) is attached by main()
            # after construction rather than initialised here.

    #Define initial leaf nodes
    # Every leaf carries the whole genome as a single interval.
    node_list = []
    for i in range(num_leaves):
        node_list.append(Node())
        node_list[i].ancestral_material = [[0,genome_length-1]]
        node_list[i].ancestral = 1
        node_list[i].label = str(i)
        node_list[i].number = i

    # Indices of lineages that have not yet coalesced / been replaced.
    live_nodeList = []
    for i in range(num_leaves):
        live_nodeList.append(i)

    # Index that the next created node will take in node_list.
    new_node = num_leaves
    current_time = 0.0

    #########################################
    # Simulate coalescent-recombination graph
    #########################################

    # Counters for accepted / rejected recombination proposals.
    num_rejected = 0
    num_accepted = 0

    while len(live_nodeList) > 1:

        #Simulate time to next event
        # Inverse-transform draw of an exponential waiting time with rate
        # k*(k-1+recomb_rate)/2, where k = len(live_nodeList).
        # NOTE(review): np.random.rand(1) returns a 1-element array, so
        # current_time (and every node age set from it) becomes an array
        # after the first iteration - confirm downstream helpers expect that.
        current_time = current_time - 2 * np.log(1 - np.random.rand(1)) / (len(live_nodeList) * (len(live_nodeList) - 1 + recomb_rate))

        #Determine whether event is coalescent or recombination
        reac_rand = np.random.rand(1)
        #print (str(reac_rand[0]) + " " + str(float(len(live_nodeList) - 1) / (len(live_nodeList) - 1 + recomb_rate)) + " " + str((len(live_nodeList) + recomb_rate)/2))
        # Coalescence is chosen with probability (k-1)/(k-1+recomb_rate).
        if reac_rand[0] < (float(len(live_nodeList) - 1) / (len(live_nodeList) - 1 + recomb_rate)):
            #Coalescent event occurs

            #Choose two children nodes to coalesce
            children = random.sample(live_nodeList,2)
            #Set the parent of the two children
            node_list[children[0]].parent.append(new_node)
            node_list[children[1]].parent.append(new_node)

            #Make new parent node
            node_list.append(Node())
            node_list[new_node].children = children
            node_list[new_node].age = current_time
            node_list[new_node].number = new_node

            # Create the combined parental ancestral material
            # Combine the two ancestries into one list
            # Call function to combine two parts of ancestral material
            new_ancestry = combine_ancestralMaterial(node_list[children[0]].ancestral_material,node_list[children[1]].ancestral_material)
            # Store a shallow copy of the returned interval list.
            node_list[new_node].ancestral_material = new_ancestry[:]

            if dot_check == 1:
                # Check if new parent is ancestral for nucleotide of interest
                ancestral = 0
                for i in range(len(node_list[new_node].ancestral_material)):
                    if (nucleotide >= node_list[new_node].ancestral_material[i][0]) and (nucleotide <= node_list[new_node].ancestral_material[i][1]):
                        ancestral = 1
                        break
                if ancestral == 1:
                    node_list[new_node].ancestral = 1

                # DOT_coal is defined elsewhere; it returns the updated node
                # and edge statement lists.
                [graph_nodes,graph_edges] = DOT_coal(node_list[new_node].ancestral,new_node,children,node_list[children[0]].ancestral,node_list[children[1]].ancestral,graph_nodes,graph_edges)

            # Replace the two coalesced children with their parent in the
            # list of live lineages (parent is placed first).
            temp_liveNodes = []
            temp_liveNodes.append(new_node)
            for i in live_nodeList:
                if (i != children[0]) and (i != children[1]):
                    temp_liveNodes.append(i)
            live_nodeList = temp_liveNodes[:]

            new_node = new_node + 1

        else:
            #Recombination event occurs
            #Choose a child at random
            child = random.sample(live_nodeList,1)
            #Choose break point based on ancestral material of the child
            # Start is uniform over genome positions; length is
            # floor(log(U)/log(1-break_rate)), i.e. a geometric-type draw
            # whose mean is roughly 1/break_rate.
            break_start = int(random.sample(range(genome_length),1)[0])
            break_length = np.floor(np.log(np.random.rand(1))/np.log(1-break_rate))[0]
            if break_length >= genome_length:
                # A break at least as long as the genome is rejected outright;
                # break_end is not assigned on this path (it is only read
                # when ancestral_check == 1).
                ancestral_check = 0
            else:
                # The end position wraps around modulo the genome length.
                break_end = break_start + int(break_length)
                break_end = int(int(break_end) % genome_length)

                # Assume the event is accepted until one of the tests below
                # rejects it.
                ancestral_check = 1

                if break_start <= break_end:
                    #Recombinant break does not go over the 'end' of the genome
                    #Do not perform recombinant reaction if recombinant break is outside of ancestral material
                    # True when the child's material does not reach both
                    # ends of the genome (first interval not starting at 0 or
                    # last interval not ending at genome_length-1).
                    if (node_list[child[0]].ancestral_material[0][0] != 0) or (node_list[child[0]].ancestral_material[-1][1] != genome_length-1):

                        # Break spans from at/before the first ancestral
                        # start to at/after the last ancestral end.
                        if (break_start <= node_list[child[0]].ancestral_material[0][0]) and (break_end >= node_list[child[0]].ancestral_material[-1][1]):
                            ancestral_check = 0

                        # Break lies entirely at or before the first
                        # ancestral start.
                        elif (break_start <= node_list[child[0]].ancestral_material[0][0]) and (break_end <= node_list[child[0]].ancestral_material[0][0]):
                            ancestral_check = 0

                        # Break lies entirely at or after the last ancestral
                        # end.
                        elif (break_start >= node_list[child[0]].ancestral_material[-1][1]) and (break_end >= node_list[child[0]].ancestral_material[-1][1]):
                            ancestral_check = 0

                        else:
                            # Break falls strictly inside a gap between two
                            # consecutive ancestral intervals.
                            for i in range(len(node_list[child[0]].ancestral_material)-1):
                                if (break_start > node_list[child[0]].ancestral_material[i][1]) and (break_end < node_list[child[0]].ancestral_material[i+1][0]):
                                    ancestral_check = 0
                                    break
                    else:
                        # Material reaches both genome ends: only the
                        # "inside a gap" test applies.
                        for i in range(len(node_list[child[0]].ancestral_material)-1):
                            if (break_start > node_list[child[0]].ancestral_material[i][1]) and (break_end < node_list[child[0]].ancestral_material[i+1][0]):
                                ancestral_check = 0
                                break
                else:
                    # break_start > break_end: the break wraps past the end of
                    # the genome.
                    if (node_list[child[0]].ancestral_material[0][0] != 0) or (node_list[child[0]].ancestral_material[-1][1] != genome_length-1):

                        # Both break coordinates at or before the first
                        # ancestral start.
                        if (break_start <= node_list[child[0]].ancestral_material[0][0]) and (break_end <= node_list[child[0]].ancestral_material[0][0]):
                            ancestral_check = 0

                        # Both break coordinates at or after the last
                        # ancestral end.
                        elif (break_start >= node_list[child[0]].ancestral_material[-1][1]) and (break_end >= node_list[child[0]].ancestral_material[-1][1]):
                            ancestral_check = 0

                        # Break starts at/after the last ancestral end and,
                        # after wrapping, stops at/before the first start.
                        elif (break_start >= node_list[child[0]].ancestral_material[-1][1]) and (break_end <= node_list[child[0]].ancestral_material[0][0]):
                            ancestral_check = 0

                        else:
                            # Reject when, for some pair of consecutive
                            # intervals, break_end is at/after the first
                            # one's end and break_start is at/before the
                            # second one's start.
                            for i in range(len(node_list[child[0]].ancestral_material)-1):
                                if (break_end >= node_list[child[0]].ancestral_material[i][1]) and (break_start <= node_list[child[0]].ancestral_material[i+1][0]):
                                    ancestral_check = 0
                                    break

                    else:
                        # Same consecutive-interval test as above for material
                        # that reaches both genome ends.
                        for i in range(len(node_list[child[0]].ancestral_material)-1):
                            if (break_end >= node_list[child[0]].ancestral_material[i][1]) and (break_start <= node_list[child[0]].ancestral_material[i+1][0]):
                                ancestral_check = 0
                                break




            #If recombinant break does not split up ancestral material, ignore recombination event
            if ancestral_check == 1:
                num_accepted = num_accepted + 1
                #Write accepted recombination point to file
                recomb_file.write("%d\t%d\n" % (break_start,break_end))

                #Set the two parents
                node_list[child[0]].parent = [new_node,new_node+1]

                #Make new parent nodes
                node_list.append(Node())
                node_list.append(Node())
                node_list[new_node].children.append(child[0])
                node_list[new_node+1].children.append(child[0])
                node_list[new_node].age = current_time
                node_list[new_node+1].age = current_time
                node_list[new_node].number = new_node
                node_list[new_node+1].number = new_node+1

                # Split ancestral material into two parents
                # Call function to split ancestral material
                [node_list[new_node].ancestral_material,node_list[new_node+1].ancestral_material] = split_ancestry(node_list[child[0]].ancestral_material,break_start,break_end,genome_length)

                if dot_check == 1:
                    #Check if two parents are ancestral to nucleotide of interest
                    ancestral = 0
                    for i in range(len(node_list[new_node].ancestral_material)):
                        if (nucleotide >= node_list[new_node].ancestral_material[i][0]) and (nucleotide <= node_list[new_node].ancestral_material[i][1]):
                            ancestral = 1
                            break
                    if ancestral == 1:
                        node_list[new_node].ancestral = 1
                    ancestral = 0
                    for i in range(len(node_list[new_node+1].ancestral_material)):
                        if (nucleotide >= node_list[new_node+1].ancestral_material[i][0]) and (nucleotide <= node_list[new_node+1].ancestral_material[i][1]):
                            ancestral = 1
                            break
                    if ancestral == 1:
                        node_list[new_node+1].ancestral = 1

                    #Call function to write recombination nodes and edges to file
                    [graph_nodes,graph_edges] = DOT_recomb(new_node,new_node+1,child[0],node_list[child[0]].ancestral,node_list[new_node].ancestral,node_list[new_node+1].ancestral,graph_nodes,graph_edges)

                # Sanity check: an accepted break must leave both parents with
                # some ancestral material; otherwise dump diagnostics and stop.
                # NOTE(review): recomb_file (and dot_file, if open) are not
                # closed before this sys.exit().
                if (len(node_list[new_node].ancestral_material) == 0) or (len(node_list[new_node+1].ancestral_material) == 0):
                    print ("Child: "  +str(node_list[child[0]].ancestral_material))
                    print ("Break start: " + str(break_start) + " Break end: " + str(break_end))
                    print ("Recombination parent 1: " + str(node_list[new_node].ancestral_material))
                    print ("Recombination parent 2: " + str(node_list[new_node+1].ancestral_material))
                    sys.exit()

                # Replace the recombining child with its two parents in the
                # list of live lineages.
                temp_liveNodes = []
                temp_liveNodes.append(new_node)
                temp_liveNodes.append(new_node+1)
                for i in live_nodeList:
                    if (i != child[0]):
                        temp_liveNodes.append(i)
                live_nodeList = temp_liveNodes[:]

                new_node = new_node + 2
            else:
                num_rejected = num_rejected + 1
            #END if
        #END WHILE


    recomb_file.close()

    # Report how many recombination proposals were accepted / rejected.
    print num_accepted
    print num_rejected

    #for i in range(len(node_list)):
    #    print node_list[i].ancestral_material

    if dot_check == 1:
        # Flush the accumulated DOT statements and close the graph.
        for i in range(len(graph_nodes)):
            dot_file.write(str(graph_nodes[i]))
        for i in range(len(graph_edges)):
            dot_file.write(str(graph_edges[i]))
        dot_file.write('}')
        dot_file.close()

    #Check for any nucleotides not present in the ancestral material
    #of the root node
    # The last node appended to node_list is treated as the root here.
    for i in range(genome_length):
        check = 0
        for j in range(len(node_list[len(node_list)-1].ancestral_material)):
            if (i >= node_list[len(node_list)-1].ancestral_material[j][0]) and (i <= node_list[len(node_list)-1].ancestral_material[j][1]):
                check = 1
        if check == 0:
            print ("Error, nucleotide: " + str(i))
            print ("Root ancestry: " + str(node_list[len(node_list)-1].ancestral_material))
            sys.exit()


    #########################################
    # Create clonal trees for each nucleotide
    #########################################

    if tree_check == 1:
        # Newick strings, one per genome position, kept for the optional
        # pairwise comparison below.
        clonal_trees = []
        #Create tree for each nucleotide
        tree_file = open('trees.nwk','w')
        #for nuc in range(genome_length):
        print "Calculating marginal trees and simulating sequences..."
        for nuc in range(genome_length):
            # Progress output: one line per genome position.
            print nuc
            #Copy node list
            # A deep copy is taken for every position so the helpers below can
            # modify temp_list without touching node_list.
            temp_list = []
            temp_list = deepcopy(node_list)
            #Create a list of all clonal nodes
            clonal_list = []
            clonal_list = create_initialList(temp_list,nuc)

            #Remove un-ancestral parents and children
            clonal_list = remove_unancestral(temp_list,clonal_list)

            #Remove recombinant nodes
            [temp_list,clonal_list] = remove_recomb(temp_list,clonal_list)

            #Remove last node if recombination causes elongation of the tree
            if len(temp_list[clonal_list[-1]].children) == 1:
                clonal_list = clonal_list[0:-1]

            #Set last node as root
            temp_list[clonal_list[-1]].parent = []

            # Calculate Newick tree for nucleotide and write to file
            newick_tree = []
            newick_tree = create_newick(temp_list,clonal_list,num_leaves)
            newick_tree = newick_tree + ";"

            # Each line of trees.nwk is "[position]<newick>;".
            tree_file.write("[%d]%s\n" % (nuc,newick_tree))
            clonal_trees.append(newick_tree)

            #Simulate nucleotide down clonal frame
            # simulate_nuc (defined elsewhere) returns the node list that is
            # carried into the next position; presumably it extends each
            # node's 'sequence' - verify against its definition.
            node_list = simulate_nuc(nuc,node_list,temp_list,clonal_list,mutation_rate,num_leaves)

        # Write the leaf sequences in FASTA format, using the leaf index as
        # the record name.
        sequence_file = open("genomes.fasta","w")
        for i in range(num_leaves):
            sequence_file.write(">%d\n" % i)
            sequence_file.write("%s\n" % node_list[i].sequence)
        sequence_file.close()
        #Perform pairwise comparison of trees at each nucleotide
        if compare_check == 1:
            print "Comparing marginal trees..."
            # All ordered pairs (i, j) are evaluated: genome_length**2 calls
            # to compare_trees.
            tree_comparisons = np.zeros((genome_length,genome_length))
            for i in range(genome_length):
                print i
                for j in range(genome_length):
                    tree_comparisons[i][j] = compare_trees(clonal_trees[i],clonal_trees[j])
            #Plot heat map of distance metric
            img = plt.pcolor(tree_comparisons)
            plt.colorbar(img)
            # NOTE(review): pyplot.show() does not take the image as an
            # argument; confirm how the installed matplotlib treats this
            # positional value.
            plt.show(img)

        # trees.nwk is only closed here, after the optional (blocking) plot.
        tree_file.close()