Example #1
0
def _find_leaf(nodes, point, alive_only):
    """Return the first leaf in *nodes* whose interval contains *point*.

    Nodes are scanned in list order. When ``alive_only`` is true, leaves whose
    ``is_dead`` flag is set are skipped. Returns ``None`` if nothing matches.
    """
    for node in nodes:
        if not node.is_leaf or not is_in(point, node.getTuple()):
            continue
        if alive_only and node.is_dead:
            continue
        return node
    return None


def _write_tree_report(f, root, nodes):
    """Write the per-node report to the open file *f* and collect leaf info.

    Returns ``(leaf_chrlen, leaf_index)``: the ``chrlen`` of every leaf and
    the position of each leaf inside *nodes*, in list order.
    """
    leaf_chrlen = []
    leaf_index = []
    f.write("Before the tree, chromosomomal length is " + str(root.chrlen) +
            "\n")

    # First pass: parent id and name of every node.
    for i, node in enumerate(nodes):
        f.write("node %d: \n" % i)
        f.write("    parent = %d\n" % node.parent.getID())
        f.write("    name = " + str(node.name) + "\n")

    # Second pass: copy-number events, reference-coordinate summary, lengths.
    for i, node in enumerate(nodes):
        if node.is_leaf:
            leaf_index.append(i)
            leaf_chrlen.append(node.chrlen)
        cn = node.cn
        f.write("node %d from %d: total CN # = %d\n" %
                (i, node.parent.getID(), len(cn)))
        for k, event in enumerate(cn):
            f.write(
                "    copy number %d: allele: %d, is del: %d, chromosome: %d, position: [%d, %d], amplification #: %d\n"
                % (k, event.CN_Ale, event.CN_Del, event.CN_chromosome,
                   event.CN_p1, event.CN_p2, event.CN_amp_num))
        # copy number summary, keyed by chromosome
        cn_summary = node.cn_summary
        for chrom in sorted(cn_summary):
            f.write("At chromosome %s\n" % (chrom,))
            per_chrom = cn_summary[chrom]
            for key in sorted(per_chrom):
                f.write("   %s, %d\n" % (key, per_chrom[key]))
        f.write("    " + str(node.chrlen) + "\n")
    return leaf_chrlen, leaf_index


def gen_tree(n, Beta, Alpha, Delta, Output, cn_num, del_rate, min_cn_size,
             exp_theta, amp_p, template_ref, outfile, fa_prefix, snv_rate,
             root_mult, whole_amp, whole_amp_rate, whole_amp_num,
             amp_num_geo_par):
    """Simulate a binary tree of copy-number events and write a report.

    Layout of the first nodes::

                root            (id -1, unmodified template lengths)
                 | CN0
               node 0
             / CN1   \\ CN2
         node 1    node 2

    Every node carries an interval inside [0, 1]. At each later step either
    a leaf is marked dead (when ``Vi[j] < Delta``) or a live leaf is split
    into two children whose intervals partition the parent's interval at a
    Beta-distributed point. Each new node receives its own copy-number events
    through ``add_CN`` on top of its parent's ``chrlen`` / ``corres``.
    References (sequences) are not materialised here; only the events are
    recorded on the nodes.

    Args:
        n: Size parameter. ``2 * n - 1`` branch lengths and ``n - 1`` values of
            each split/death draw are sampled. Must be at least 2 because the
            first split reads ``Bi[0]`` unconditionally.
        Beta, Alpha: The split point is drawn from
            ``np.random.beta(Alpha + 1, Beta + 1)``.
        Delta: Threshold on a uniform draw below which a step is a death
            event instead of a split.
        Output: Unused by this function.
        cn_num: Event-count argument handed to ``add_CN`` for each node; the
            branch above node 0 uses ``cn_num * root_mult``.
        del_rate, min_cn_size, exp_theta, amp_p: Forwarded unchanged to
            ``add_CN``.
        template_ref: Passed to ``init_ref`` to obtain chromosome names and
            sizes.
        outfile: Path of the text report written by this function.
        fa_prefix, snv_rate: Unused by this function.
        root_mult: Multiplier applied to ``cn_num`` on the branch above node 0.
        whole_amp: When equal to 1, ``add_whole_amp`` is applied to node 0
            before its ``add_CN`` events.
        whole_amp_rate, whole_amp_num, amp_num_geo_par: Forwarded to
            ``add_whole_amp``.

    Returns:
        ``(leaf_chrlen, leaf_index, chr_name_array, Tree)`` where ``Tree`` is
        the list of all nodes (index == node id), ``leaf_index`` lists the
        positions of the leaves in ``Tree`` and ``leaf_chrlen`` their
        ``chrlen`` values.
    """
    CN_LIST_ID = 0

    def add_events(chrlen, corres, count):
        # Single place that forwards the shared add_CN settings.
        return add_CN(chrlen, count, del_rate, min_cn_size, exp_theta, amp_p,
                      corres, CN_LIST_ID)

    # The report is opened before any simulation work so that an unwritable
    # path fails immediately; `with` guarantees the handle is closed even
    # when a later step raises.
    with open(outfile, "w") as f:
        # All random draws are taken up front, in this order.
        ti = np.random.exponential(1, 2 * n - 1)
        Ui = np.random.uniform(0.0, 1.0, n - 1)
        Vi = np.random.uniform(0.0, 1.0, n - 1)
        Di = np.random.uniform(0.0, 1.0, n - 1)
        Bi = np.random.beta(float(Alpha + 1), float(Beta + 1), n - 1)

        # Normalise the branch lengths so that they sum to one.
        ti /= float(sum(ti))

        # root is the node before node 0 in the tree
        root = MyNode("0: [0,1]")
        root.tuple = [0, 1]
        ref_array, chr_name_array, chr_sz = init_ref(template_ref)
        # The reference returned by init_ref is not used in this function.
        del ref_array
        # Each allele gets its OWN length list and its OWN corres_coord
        # objects (one [ref1, ref2, gen1, gen2] record per chromosome), so
        # that the two alleles never share state.
        root.chrlen = [list(chr_sz), list(chr_sz)]
        root.corres = [[[corres_coord(0, size, 0, size)] for size in chr_sz]
                       for _ in range(2)]
        root.id = -1

        Tree = [MyNode("0: [0,1]")]
        founder = Tree[0]
        founder.tuple = [0, 1]
        founder.id = 0
        if whole_amp == 1:
            # whole chromosome amplification first, then the regular events
            founder.cn, founder.chrlen, founder.corres = add_whole_amp(
                root.chrlen, whole_amp_rate, whole_amp_num, root.corres,
                amp_num_geo_par)
            # assume most of the CN happens on the root branch
            extra_cn, founder.chrlen, founder.corres = add_events(
                founder.chrlen, founder.corres, cn_num * root_mult)
            founder.cn.extend(extra_cn)
        else:
            founder.cn, founder.chrlen, founder.corres = add_events(
                root.chrlen, root.corres, cn_num * root_mult)
        founder.cn_detail, founder.cn_summary = get_cn_from_corres(
            founder.corres, chr_sz)
        founder.parent = root
        founder.edge_length = np.random.exponential(1, 1)

        # First split: node 0 -> nodes 1 and 2, cut at Bi[0].
        Tree.append(MyNode("1:[0,{0:.2f}],{1:.4f}".format(Bi[0], ti[0])))
        Tree.append(MyNode("2:[{0:.2f},1],{1:.4f}".format(Bi[0], ti[1])))
        for child in (Tree[1], Tree[2]):
            child.cn, child.chrlen, child.corres = add_events(
                founder.chrlen, founder.corres, cn_num)
            child.cn_detail, child.cn_summary = get_cn_from_corres(
                child.corres, chr_sz)
        for child_id, interval in ((1, [0, Bi[0]]), (2, [Bi[0], 1])):
            child = Tree[child_id]
            child.parent = founder
            child.parentID = 0
            child.id = child_id
            child.tuple = interval
            child.edge_length = ti[child_id - 1]

        # Remaining steps: either kill a leaf or split a live leaf.
        node_number = 2
        for j in range(1, n - 1):
            if Vi[j] < Delta:
                victim = _find_leaf(Tree, Di[j], alive_only=False)
                if victim is not None:
                    if not victim.is_dead:
                        victim.name = victim.name + "*"
                    victim.setDead()
                continue

            parent = _find_leaf(Tree, Ui[j], alive_only=True)
            if parent is None:
                continue

            this_id = parent.getID()
            a, b = parent.getTuple()
            node_number += 2
            # Two new children are born here; `middle` cuts the interval.
            middle = float(Bi[j]) * float(float(b) - float(a)) + float(a)
            newborn = []
            for child_id, low, high in ((node_number - 1, a, middle),
                                        (node_number, middle, b)):
                child = MyNode(
                    "{0}:[{1:.4f},{2:.4f}],{3:.4f}".format(
                        child_id, low, high, ti[child_id]),
                    parent=parent)
                child.tuple = [low, high]
                child.edge_length = ti[child_id]
                Tree.append(child)
                newborn.append((child_id, child))

            for child_id, child in newborn:
                # each child starts from a fresh shallow copy of the parent's
                # per-allele length list
                child.cn, child.chrlen, child.corres = add_events(
                    parent.chrlen[:], parent.corres, cn_num)
                child.cn_detail, child.cn_summary = get_cn_from_corres(
                    child.corres, chr_sz)
                child.parentID = this_id
                child.id = child_id

        leaf_chrlen, leaf_index = _write_tree_report(f, root, Tree)

    return leaf_chrlen, leaf_index, chr_name_array, Tree
# Step 2. Consider even coverage; use Metropolis-Hastings to sample the read count in each bin based on a given point on the Lorenz curve.

# if first step was skipped, read the previous stored file
if skip == 1:
    print("Skip the first step. Reading ")
    #abs_path = os.getcwd()
    leaf_chrlen_f = save_prefix + ".leaf_chrlen.npy"
    leaf_index_f = save_prefix + ".leaf_index.npy"
    chr_name_array_f = save_prefix + ".chr_name_array.npy"
    tree_f = save_prefix + ".tree.npy"
    #ref_f = save_prefix + ".ref.npy"
    leaf_chrlen = numpy.load(leaf_chrlen_f)
    leaf_index = numpy.load(leaf_index_f)
    chr_name_array = numpy.load(chr_name_array_f)
    tree = numpy.load(tree_f)
    [ref, tmp_chr_name, tmp_len_chr] = init_ref(template_ref)
    #ref = numpy.load(ref_f)
    #print(leaf_chrlen_f)
    #print(leaf_index_f)
    #print(chr_name_array_f)

    # make it either making a tree, or generating the leaves
    [Alpha, Beta] = get_beta_dist(x0, y0)
    index = 0
    leaf_index_ = 0
    print("Alpha = %.2f, Beta = %.2f", Alpha, Beta)
    print("Number of processes: " + str(NUM_OF_PROCESSES))
    print("Number of leaves: " + str(n))
    # each leaf
    processes = []
    for all_chrlen in leaf_chrlen:
Example #3
0
# Divide every branch length by the previously accumulated total.
for T, raw_length in enumerate(ti):
    ti[T] = float(raw_length) / float(summation)

# Constructing the phylogeny.
# Chromosome sizes are taken from the template reference via init_ref.

# `root` is the node that sits before node 0 in the tree.
root = MyNode("0: [0,1]")
root.tuple = [0, 1]
ref_array, chr_name_array, chr_sz = init_ref(template_ref)
# Give the second allele its own list so the two length arrays are
# independent of each other.
chr_sz1 = list(chr_sz)
root.chrlen = [chr_sz, chr_sz1]
root.id = -1

# Node 0 is the first real node of the tree and spans the whole interval.
Tree = [MyNode("0: [0,1]")]
Tree[0].tuple = [0, 1]
Tree[0].id = 0
# Assume most of the CN happens on the root branch: four times the usual
# event count is requested for node 0.
Tree[0].cn, Tree[0].chrlen = add_CN(
    root.chrlen, (cn_num * 4), del_rate, min_cn_size, exp_theta, amp_p)