def return_upgma(seq_list, names_list, df, cfd_dict=None):
    """Build a UPGMA instance for the given sequences.

    Args:
        seq_list: Sequences to cluster.
        names_list: Names handed to ``UPGMA.make_distance_matrix``.
        df: Distance function.  ``Metric.CRISTA`` and ``Metric.cfd_funct``
            are special-cased: the sequences are first replaced by the
            values returned from the ``Metric.pos_in_metric_*`` helpers and
            ``Metric.find_dist_np`` is used as the distance function.
        cfd_dict: Forwarded unchanged to the ``Metric.pos_in_metric_*``
            helpers.

    Returns:
        Whatever ``UPGMA.make_UPGMA`` returns for the distance matrix.
    """
    if df == Metric.CRISTA:
        # Reference set: every sequence without its first 3 and last 6
        # characters.
        base = [sg[3:-6] for sg in seq_list]
        seq_list = [
            Metric.pos_in_metric_general(t, df, base, cfd_dict)
            for t in seq_list
        ]
        df = Metric.find_dist_np

    if df == Metric.cfd_funct:
        seq_list = [
            Metric.pos_in_metric_cfd_np(t, cfd_dict) for t in seq_list
        ]
        df = Metric.find_dist_np

    initial_matrix = UPGMA.make_initiale_matrix(df, seq_list)
    distance_matrix = UPGMA.make_distance_matrix(names_list, initial_matrix)
    return UPGMA.make_UPGMA(distance_matrix)
예제 #2
0
def return_upgma(seq_list, names_list, df):
    """Build a UPGMA instance from sequences, their names and a distance function.

    Args:
        seq_list: Sequences to cluster.
        names_list: Names handed to ``UPGMA.make_distance_matrix``.
        df: Distance function handed to ``UPGMA.make_initiale_matrix``.

    Returns:
        Whatever ``UPGMA.make_UPGMA`` returns for the distance matrix.
    """
    initial_matrix = UPGMA.make_initiale_matrix(df, seq_list)
    distance_matrix = UPGMA.make_distance_matrix(names_list, initial_matrix)
    result = UPGMA.make_UPGMA(distance_matrix)
    return result
예제 #3
0
def test_compre_to_shirans():
    """Print every pair of random 20-mers on which the two CFD scorers disagree.

    200 random sequences over A/G/C/T are generated and every ordered pair
    (including a sequence with itself) is scored with both ``cfd_funct`` and
    ``UPGMA.cfd_func``.
    """
    nucleotides = ["A", "G", "C", "T"]

    sequences = []
    for _ in range(200):
        sequences.append(
            ''.join(random.choice(nucleotides) for _ in range(20)))

    for first in sequences:
        for second in sequences:
            scores_differ = (
                cfd_funct(first, second) != UPGMA.cfd_func(first, second))
            if scores_differ:
                print("different: ", first, second)
                print("scores :", cfd_funct(first, second),
                      UPGMA.cfd_func(first, second))
예제 #4
0
    def Load_File_Internal(self, tfile):
        """Record the aligned/distance file paths and build the UPGMA tree.

        The aligned-file path is ``tfile`` with '/' replaced by ``os.sep``.
        The distance-file path is the same path with its extension (if the
        file name has one) replaced by ".dst".  ``self.tree`` is built from
        the distance file and ``self.Grab_Info()`` is called afterwards.  If
        an ``IOError`` is raised while building the tree, a message is
        printed and the method returns without calling ``Grab_Info``.
        """
        # The file to find the protein distance matrix.
        aligned = tfile.replace('/', os.sep)
        self.aligned_file = aligned

        # Only treat the last '.' as an extension separator when it comes
        # after the last path separator; otherwise keep the whole path.
        dot_pos = aligned.rfind('.')
        sep_pos = aligned.rfind(os.sep)
        if dot_pos != -1 and dot_pos >= sep_pos:
            stem_end = dot_pos
        else:
            stem_end = len(aligned)
        self.distance_file = aligned[:stem_end] + ".dst"

        print("trying to open treesystem with aligned %s distance %s" % (
            self.aligned_file, self.distance_file))
        aligned_exists = os.path.isfile(self.aligned_file)
        if not aligned_exists:
            print("no aligned file %s found" % (self.aligned_file))

        try:
            print('treesystem opening %s' % (self.aligned_file))
            print("sending %s off to UPGMA" % (self.distance_file))
            # Root tree is the base tree.
            self.tree = UPGMA.UPGMA(self.distance_file)
        except IOError:
            print("File must be a FASTA formatted file with more than one sequence")
            return

        # Create a dictionary by GI of the titles and sequences.
        self.Grab_Info()
예제 #5
0
def test2_compre_to_shirans():
    """Print one minus each CFD scorer's result for a fixed pair of sequences.

    The two sequences differ only in their first character.
    """
    reference = "ACGTACGTACGTACGTACGG"
    variant = "GCGTACGTACGTACGTACGG"

    local_score = 1 - cfd_funct(reference, variant)
    upgma_score = 1 - UPGMA.cfd_func(reference, variant)

    print(local_score, upgma_score)
예제 #6
0
 def Make_Tree(self, file):
     """Rebuild ``self.tree`` and clear the ``changed`` flag.

     With a non-empty ``self.gi_list`` the tree is built by
     ``UPGMA.UPGMAFiltered(file, self.gi_list)``; otherwise it is
     ``TreeParse.Tree(0)``.  Returns the new tree.
     """
     has_filter = len(self.gi_list) > 0
     self.tree = (UPGMA.UPGMAFiltered(file, self.gi_list)
                  if has_filter else TreeParse.Tree(0))
     self.changed = 0
     return self.tree
def test_fill_distance():
    """Smoke-test ``fill_leaves_sets`` / ``fill_distance_from_leaves``.

    Builds a UPGMA tree over three short sequences using
    ``UPGMA.p_distance``, runs both fill passes on it, then follows
    ``parent`` links from one element of the root's ``leaves_DS`` until a
    falsy parent is reached.
    """
    names = ["a", "b", "c"]
    seq_list = ["aret", "ardw", "brdw"]

    initial_matrix = UPGMA.make_initiale_matrix(UPGMA.p_distance, seq_list)
    distance_matrix = UPGMA.make_distance_matrix(names, initial_matrix)
    print("names")
    print(distance_matrix.names)
    # Exercise the matrix repr; the resulting text is not used.
    distance_matrix.__repr__()

    tree = UPGMA.make_UPGMA(distance_matrix)
    fill_leaves_sets(tree)
    fill_distance_from_leaves(tree)

    current = list(tree.root.leaves_DS)[0]
    while current:
        current = current.parent
예제 #8
0
파일: Stage2.py 프로젝트: shiranab/CRISPys
def return_upgma(seq_list, names_list, df, cfd_dict=None):
    """Build a UPGMA instance for the given sequences.

    Args:
        seq_list: Sequences to cluster.
        names_list: Names handed to ``UPGMA.make_distance_matrix``.
        df: Distance function.  When it equals ``Metric.cfd_funct`` the
            sequences are first replaced by the values returned from
            ``Metric.pos_in_metric_cfd_np`` and ``Metric.find_dist_np`` is
            used as the distance function.
        cfd_dict: Forwarded unchanged to ``Metric.pos_in_metric_cfd_np``.

    Returns:
        Whatever ``UPGMA.make_UPGMA`` returns for the distance matrix.
    """
    if df == Metric.cfd_funct:
        seq_list = [
            Metric.pos_in_metric_cfd_np(t, cfd_dict) for t in seq_list
        ]
        df = Metric.find_dist_np

    initial_matrix = UPGMA.make_initiale_matrix(df, seq_list)
    distance_matrix = UPGMA.make_distance_matrix(names_list, initial_matrix)
    return UPGMA.make_UPGMA(distance_matrix)
예제 #9
0
def main(arg1, arg2):
    """Fold the Newick trees listed in a file into one tree with ``scm``.

    Args:
        arg1: Path of a text file with one tree string per line; each
            stripped line is passed to ``Tree``.
        arg2: Merge-order strategy.
            ``"common"``: order the trees with ``UPGMA_inc.UPGMA`` over the
            pairwise ``len(intersection(...))`` of their leaf-name lists.
            ``"uncommon"``: order the trees with ``UPGMA.UPGMA`` over the
            pairwise ``get_unique(...)`` of their leaf-name lists.
            Any other value: merge the trees in file order.

    Returns:
        The ``Tree`` left after every input tree has been merged into the
        running result via ``Tree(scm(next_tree, running_result))``.
    """
    start_time = time.time()  # only referenced by disabled timing output
    with open(arg1) as handle:
        raw_lines = handle.readlines()
    # Remove whitespace characters like '\n' at the end of each line.
    content = [line.strip() for line in raw_lines]

    def leaf_names(tree):
        """Collect ``.name`` of every item produced by iterating ``tree``."""
        return [leaf.name for leaf in tree]

    def guide_order(pair_score, cluster):
        """Return the leaf labels of the clustering tree in post-order.

        ``pair_score`` scores two leaf-name lists; ``cluster`` turns the
        lower-triangular score matrix plus labels into ``(newick, order)``.
        """
        names_by_tree = [leaf_names(Tree(newick)) for newick in content]
        # Row x holds the scores against trees 0 .. x-1 (row 0 is empty).
        lower_triangle = [
            [pair_score(names_by_tree[row], names_by_tree[col])
             for col in range(row)]
            for row in range(len(content))
        ]
        labels = number_labels(0, len(content))
        guide_newick, _order = cluster(lower_triangle, labels)
        guide = Tree(guide_newick + ';')
        return [node.name
                for node in guide.traverse("postorder")
                if node.is_leaf()]

    # The scoring/clustering callables are wrapped in lambdas so that the
    # project names are only looked up at the point where they are called.
    if arg2 == "common":
        # NOTE(review): this branch clusters with UPGMA_inc while the
        # "uncommon" branch uses UPGMA -- confirm that is intended.
        order = guide_order(
            lambda first, second: len(intersection(first, second)),
            lambda matrix, labels: UPGMA_inc.UPGMA(matrix, labels))
        merged = Tree(content[int(order[0])])
        for label in order[1:]:
            incoming = Tree(content[int(label)])
            incoming_copy = incoming.copy()  # result is not used afterwards
            merged = Tree(scm(incoming, merged))
    elif arg2 == "uncommon":
        order = guide_order(
            lambda first, second: get_unique(first, second),
            lambda matrix, labels: UPGMA.UPGMA(matrix, labels))
        merged = Tree(content[int(order[0])])
        for label in order[1:]:
            incoming = Tree(content[int(label)])
            # Copy taken before scm() is given the incoming tree.
            incoming_copy = incoming.copy()
            merged = Tree(scm(incoming, merged))

            overlap = intersection(leaf_names(incoming), leaf_names(merged))
            merged_copy = merged.copy()
            incoming_copy.prune(overlap)
            merged_copy.prune(overlap)

            # The split comparison is computed but its result is not used.
            splits2 = rf_dist_list.main(merged_copy.copy(),
                                        incoming_copy.copy())
    else:
        merged = Tree(content[0])
        for newick in content[1:]:
            incoming = Tree(newick)
            incoming_copy = incoming.copy()  # result is not used afterwards
            merged = Tree(scm(incoming, merged))

            # Leaf names are gathered but not used any further.
            incoming_names = leaf_names(incoming)
            merged_names = leaf_names(merged)
    return merged
예제 #10
0
        dist_map[(d1,d2)] = dist


# Write the similarity (score) matrix to a file, one comma-separated row per line
file1 = "similarity.txt"
with open(file1,"w") as f:
    for row in score_matrix:
        f.write(','.join(str(e) for e in row))
        f.write('\n')
# Write the distance matrix to the distance file, one comma-separated row per line
file2 = "distance.txt"
with open(file2,"w") as f:
    for row in dist_matrix:
        f.write(','.join(str(e) for e in row))
        f.write('\n')

# # Print the distance map
# print(dist_map)
#
# Build the UPGMA tree from the pairwise distance map and pass it to the MSA step
tree = UPGMA.upgma(dist_map,len(sequences),len(sequences))
MSAseqs = Needleman.MSA(tree,sequences,score_matrix,s_matrix)

# Rebuild the distance map from the MSA sequences: every ordered index pair
# (d1, d2), including d1 == d2, maps to Needleman.distance of the two sequences
msa_dist_map = {}
for d1 in range(len(MSAseqs)):
    for d2 in range(len(MSAseqs)):
        dist = Needleman.distance(MSAseqs[d1], MSAseqs[d2], s_matrix)
        msa_dist_map[(d1,d2)] = dist

# Final UPGMA tree computed from the MSA-based distances
finaltree = UPGMA.upgma(msa_dist_map,len(MSAseqs),len(MSAseqs))