def _apply_branch_cn(node, chrlen, corres, cn_count, del_rate, min_cn_size,
                     exp_theta, amp_p, cn_list_id, chr_sz):
    """Draw the copy-number events of one branch and store them on ``node``.

    Calls ``add_CN`` on the parent state (``chrlen``, ``corres``) and stores
    its three results as ``node.cn``, ``node.chrlen`` and ``node.corres``;
    then stores the two results of ``get_cn_from_corres`` as
    ``node.cn_detail`` and ``node.cn_summary``.
    """
    node.cn, node.chrlen, node.corres = add_CN(
        chrlen, cn_count, del_rate, min_cn_size, exp_theta, amp_p, corres,
        cn_list_id)
    node.cn_detail, node.cn_summary = get_cn_from_corres(node.corres, chr_sz)


def _grow_tree(n, Beta, Alpha, Delta, cn_num, del_rate, min_cn_size,
               exp_theta, amp_p, template_ref, root_mult, whole_amp,
               whole_amp_rate, whole_amp_num, amp_num_geo_par):
    """Build the node list of the tree; return ``(root, Tree, chr_name_array)``.

    Layout of the first nodes::

              root          (id -1, the state before any copy number change)
               |  CN0
             node 0
        CN1 /      \\ CN2
        node 1    node 2
    """
    # Random draws.  The order of these calls is kept fixed so that a run
    # with a seeded numpy generator consumes the random stream the same way.
    ti = np.random.exponential(1, 2 * n - 1)  # edge lengths (at most 2n - 1)
    Ui = np.random.uniform(0.0, 1.0, n - 1)   # picks the leaf that splits
    Vi = np.random.uniform(0.0, 1.0, n - 1)   # death (< Delta) versus split
    Di = np.random.uniform(0.0, 1.0, n - 1)   # picks the leaf that dies
    Bi = np.random.beta(float(Alpha + 1), float(Beta + 1), n - 1)

    # Normalize the branch lengths so they sum to one.  The built-in sum()
    # adds the elements left to right, exactly like an explicit loop would.
    ti /= float(sum(ti))

    # The root is the node before node 0 in the tree.
    root = MyNode("0: [0,1]")
    root.tuple = [0, 1]
    _, chr_name_array, chr_sz = init_ref(template_ref)

    # Per-allele state.  root.chrlen is [allele][chromosome] -> length.
    # root.corres is [allele][chromosome] -> list of corres_coord, each built
    # here as corres_coord(0, size, 0, size) (ref1, ref2, gen1, gen2).
    # Each allele gets its own lists and its own corres_coord objects so that
    # an in-place edit of one allele can never leak into the other; chr_sz
    # itself stays untouched because it is passed to get_cn_from_corres later.
    root.chrlen = [list(chr_sz), list(chr_sz)]
    root.corres = [
        [[corres_coord(0, size, 0, size)] for size in chr_sz]
        for _allele in range(2)
    ]
    root.id = -1

    trunk = MyNode("0: [0,1]")
    trunk.tuple = [0, 1]
    trunk.id = 0
    Tree = [trunk]
    CN_LIST_ID = 0

    # Assume most of the CN happens on the root branch: node 0 receives
    # cn_num * root_mult events instead of cn_num.
    if whole_amp == 1:
        # Whole chromosome amplification first, then the regular events on
        # top of the amplified state.
        trunk.cn, trunk.chrlen, trunk.corres = add_whole_amp(
            root.chrlen, whole_amp_rate, whole_amp_num, root.corres,
            amp_num_geo_par)
        extra_cn, trunk.chrlen, trunk.corres = add_CN(
            trunk.chrlen, (cn_num * root_mult), del_rate, min_cn_size,
            exp_theta, amp_p, trunk.corres, CN_LIST_ID)
        trunk.cn.extend(extra_cn)
    else:
        trunk.cn, trunk.chrlen, trunk.corres = add_CN(
            root.chrlen, (cn_num * root_mult), del_rate, min_cn_size,
            exp_theta, amp_p, root.corres, CN_LIST_ID)
    trunk.cn_detail, trunk.cn_summary = get_cn_from_corres(
        trunk.corres, chr_sz)
    trunk.parent = root
    # Note: a length-1 array, not a scalar.
    trunk.edge_length = np.random.exponential(1, 1)

    # Nodes 1 and 2 split the interval [0, 1] at Bi[0].
    first_split = Bi[0]
    Tree.append(
        MyNode("1:[0,{0:.2f}],{1:.4f}".format(first_split, ti[0])))
    Tree.append(
        MyNode("2:[{0:.2f},1],{1:.4f}".format(first_split, ti[1])))
    for child in (Tree[1], Tree[2]):
        _apply_branch_cn(child, trunk.chrlen, trunk.corres, cn_num, del_rate,
                         min_cn_size, exp_theta, amp_p, CN_LIST_ID, chr_sz)
    first_children = (
        (1, [0, first_split], ti[0]),
        (2, [first_split, 1], ti[1]),
    )
    for child_id, interval, length in first_children:
        child = Tree[child_id]
        child.parent = trunk
        child.parentID = 0
        child.id = child_id
        child.tuple = interval
        child.edge_length = length

    # No reference sequence is generated per node (memory and disk cost).
    # Only the copy numbers are stored; a leaf's sequence can be obtained
    # later by tracing its copy numbers back up to the root.
    node_number = 2
    for j in range(1, n - 1):
        if Vi[j] < Delta:
            # Death event: the leaf whose interval contains Di[j] is marked
            # dead, unless it already is.
            victim = next(
                (nd for nd in Tree
                 if nd.is_leaf and is_in(Di[j], nd.getTuple())),
                None)
            if victim is not None and not victim.is_dead:
                victim.name = victim.name + "*"
                victim.setDead()
            continue

        # Split event: the living leaf whose interval contains Ui[j] gets
        # two children.  If no such leaf exists the draw is skipped.
        parent = next(
            (nd for nd in Tree
             if nd.is_leaf and is_in(Ui[j], nd.getTuple())
             and (not nd.is_dead)),
            None)
        if parent is None:
            continue

        this_id = parent.getID()
        a, b = parent.getTuple()
        node_number += 2
        left_id = node_number - 1
        right_id = node_number
        middle = float(Bi[j]) * float(float(b) - float(a)) + float(a)

        # Two new children are born here; each node's index in Tree equals
        # its id because nodes are appended in id order.
        left = MyNode(
            "{0}:[{1:.4f},{2:.4f}],{3:.4f}".format(
                left_id, a, middle, ti[left_id]),
            parent=parent)
        right = MyNode(
            "{0}:[{1:.4f},{2:.4f}],{3:.4f}".format(
                right_id, middle, b, ti[right_id]),
            parent=parent)
        Tree.append(left)
        Tree.append(right)

        # The new intervals and edge lengths are assigned here.
        left.tuple = [a, middle]
        right.tuple = [middle, b]
        left.edge_length = ti[left_id]
        right.edge_length = ti[right_id]

        # Copy numbers: left child first, then right child.  Each call gets
        # a fresh shallow copy of the parent's chrlen list.
        _apply_branch_cn(left, parent.chrlen[:], parent.corres, cn_num,
                         del_rate, min_cn_size, exp_theta, amp_p, CN_LIST_ID,
                         chr_sz)
        _apply_branch_cn(right, parent.chrlen[:], parent.corres, cn_num,
                         del_rate, min_cn_size, exp_theta, amp_p, CN_LIST_ID,
                         chr_sz)

        right.parentID = this_id
        left.parentID = this_id
        left.id = left_id
        right.id = right_id

    return root, Tree, chr_name_array


def _write_tree_report(out, root, Tree):
    """Write the text report to ``out``; return ``(leaf_chrlen, leaf_index)``.

    ``leaf_index`` holds the positions in ``Tree`` of the leaf nodes and
    ``leaf_chrlen`` holds the ``chrlen`` of those leaves, in the same order.
    """
    # NOTE(review): all message strings (including the "chromosomomal"
    # spelling and the leading blanks) are reproduced exactly as found, in
    # case a downstream parser matches on them.
    leaf_chrlen = []
    leaf_index = []
    out.write("Before the tree, chromosomomal length is " + str(root.chrlen) +
              "\n")

    # First pass: topology.
    for idx, node in enumerate(Tree):
        out.write("node %d: \n" % idx)
        out.write(" parent = %d\n" % node.parent.getID())
        out.write(" name = " + str(node.name) + "\n")

    # Second pass: copy number events of every node; leaves are recorded on
    # the way.
    for idx, node in enumerate(Tree):
        if node.is_leaf:
            leaf_index.append(idx)
            leaf_chrlen.append(node.chrlen)
        events = node.cn
        out.write("node %d from %d: total CN # = %d\n" %
                  (idx, node.parent.getID(), len(events)))
        for k, event in enumerate(events):
            out.write(
                " copy number %d: allele: %d, is del: %d, chromosome: %d, position: [%d, %d], amplification #: %d\n"
                % (k, event.CN_Ale, event.CN_Del, event.CN_chromosome,
                   event.CN_p1, event.CN_p2, event.CN_amp_num))
        # The copy number summary (on the reference coordinate).
        cn_summary = node.cn_summary
        for chrom in sorted(cn_summary):
            out.write("At chromosome %s\n" % (chrom))
            per_chrom = cn_summary[chrom]
            for segment in sorted(per_chrom):
                out.write(" %s, %d\n" % (segment, per_chrom[segment]))
        out.write(" " + str(node.chrlen) + "\n")

    return leaf_chrlen, leaf_index


def gen_tree(n, Beta, Alpha, Delta, Output, cn_num, del_rate, min_cn_size,
             exp_theta, amp_p, template_ref, outfile, fa_prefix, snv_rate,
             root_mult, whole_amp, whole_amp_rate, whole_amp_num,
             amp_num_geo_par):
    """Simulate a tree with copy number changes on every branch.

    A root (id -1) carries the unmodified chromosome lengths read through
    ``init_ref(template_ref)``.  Node 0 hangs below it, nodes 1 and 2 below
    node 0, and further pairs of children are added by split events.  A text
    report of the tree is written to ``outfile``.

    Args:
        n: size parameter of the simulation.  ``2 * n - 1`` edge lengths and
            ``n - 1`` split/death draws are sampled; must be at least 2
            because the first split position ``Bi[0]`` is always read.
        Beta, Alpha: the split positions are drawn from
            ``np.random.beta(Alpha + 1, Beta + 1)``.
        Delta: a draw is a death event when its uniform variate is below
            ``Delta``, otherwise it is a split event.
        Output: unused here; kept for interface compatibility.
        cn_num: copy number count passed to ``add_CN`` for every branch
            except the one leading to node 0.
        del_rate: deletion rate, forwarded to ``add_CN``.
        min_cn_size: minimum copy number size, forwarded to ``add_CN``.
        exp_theta: parameter for the copy number size, forwarded to
            ``add_CN``.
        amp_p: parameter for the number of amplified copies, forwarded to
            ``add_CN``.
        template_ref: template fasta file handed to ``init_ref``.
        outfile: path of the report file; it is truncated on entry.
        fa_prefix: unused here; kept for interface compatibility.
        snv_rate: unused here; kept for interface compatibility.
        root_mult: node 0 receives ``cn_num * root_mult`` events.
        whole_amp: when equal to 1, ``add_whole_amp`` is applied on the
            branch to node 0 before its regular copy numbers.
        whole_amp_rate, whole_amp_num, amp_num_geo_par: forwarded to
            ``add_whole_amp``.

    Returns:
        A 4-tuple ``(leaf_chrlen, leaf_index, chr_name_array, Tree)``:
        the ``chrlen`` of every leaf, the indices of the leaves in ``Tree``,
        the chromosome names returned by ``init_ref``, and the list of all
        nodes below the root (a node's index equals its id).
    """
    # The report file is opened before the simulation starts, so an
    # unwritable path fails early; the context manager guarantees the handle
    # is closed even when the simulation raises.
    with open(outfile, "w") as report:
        root, Tree, chr_name_array = _grow_tree(
            n, Beta, Alpha, Delta, cn_num, del_rate, min_cn_size, exp_theta,
            amp_p, template_ref, root_mult, whole_amp, whole_amp_rate,
            whole_amp_num, amp_num_geo_par)
        leaf_chrlen, leaf_index = _write_tree_report(report, root, Tree)
    return leaf_chrlen, leaf_index, chr_name_array, Tree
# Step 2. Consider even coverage, use metropolis hasting to sample read count in each bin based on a given point on Lorenz curve. # if first step was skipped, read the previous stored file if skip == 1: print("Skip the first step. Reading ") #abs_path = os.getcwd() leaf_chrlen_f = save_prefix + ".leaf_chrlen.npy" leaf_index_f = save_prefix + ".leaf_index.npy" chr_name_array_f = save_prefix + ".chr_name_array.npy" tree_f = save_prefix + ".tree.npy" #ref_f = save_prefix + ".ref.npy" leaf_chrlen = numpy.load(leaf_chrlen_f) leaf_index = numpy.load(leaf_index_f) chr_name_array = numpy.load(chr_name_array_f) tree = numpy.load(tree_f) [ref, tmp_chr_name, tmp_len_chr] = init_ref(template_ref) #ref = numpy.load(ref_f) #print(leaf_chrlen_f) #print(leaf_index_f) #print(chr_name_array_f) # make it either making a tree, or generating the leaves [Alpha, Beta] = get_beta_dist(x0, y0) index = 0 leaf_index_ = 0 print("Alpha = %.2f, Beta = %.2f", Alpha, Beta) print("Number of processes: " + str(NUM_OF_PROCESSES)) print("Number of leaves: " + str(n)) # each leaf processes = [] for all_chrlen in leaf_chrlen:
for T in range(0, len(ti)): ti[T] = float(ti[T]) / float(summation) #print ti #Contructing the phylogeny # by default chromosome size # from hg19, Navin's 2012 paper #chr_sz = [249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566, 155270560, 59373566] # root is the node before node 0 in tree root = MyNode("0: [0,1]") root.tuple = [0, 1] ref_array, chr_name_array, chr_sz = init_ref(template_ref) chr_sz1 = [] # copy so that the two arrays of allele length are independent for i in chr_sz: chr_sz1.append(i) root.chrlen = [chr_sz, chr_sz1] #print chr_sz root.id = -1 Tree = [] Tree.append(MyNode("0: [0,1]")) Tree[0].tuple = [0, 1] Tree[0].id = 0 # assume most of the CN happens on the root branch Tree[0].cn, Tree[0].chrlen = add_CN(root.chrlen, (cn_num * 4), del_rate, min_cn_size, exp_theta, amp_p)