Esempio n. 1
0
    def get_tree(self,
                 chrom,
                 start=1,
                 end=None,
                 samples=None,
                 return_format="tree_obj"):
        """Build a neighbour-joining tree for one region.

        The region arguments (``chrom``, ``start``, ``end``, ``samples``) are
        echoed to stdout and forwarded unchanged to ``self.get_matrix``, which
        is asked for its "Phylo" format and yields a names list plus a matrix.

        Returns the tree object when ``return_format`` is "tree_obj", or the
        whitespace-stripped Newick text when it is "newick". Any other value
        falls through and yields ``None``.
        """
        region_summary = "chrom: {} start: {} end: {} samples: {}".format(
            chrom, start, end, samples)
        print(region_summary)

        labels, distances = self.get_matrix(
            chrom, start=start, end=end, samples=samples,
            return_format="Phylo")
        dm = _DistanceMatrix(labels, distances)

        # neighbour joining tree
        nj_tree = DistanceTreeConstructor().nj(dm)

        if return_format == "tree_obj":
            return nj_tree
        if return_format == "newick":
            buffer = StringIO()
            Phylo.write(nj_tree, buffer, "newick")
            return buffer.getvalue().strip()
        return None
Esempio n. 2
0
 def test_good_construction(self):
     # A valid names / lower-triangle pair must build a matrix whose type,
     # contents, length and repr are all as expected.
     built = _DistanceMatrix(self.names, self.matrix)
     self.assertIsInstance(built, TreeConstruction._DistanceMatrix)
     first_name = built.names[0]
     self.assertEqual(first_name, 'Alpha')
     self.assertEqual(built.matrix[2][1], 3)
     self.assertEqual(len(built), 4)
     expected_repr = (
         "_DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], "
         "matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]])"
     )
     self.assertEqual(repr(built), expected_repr)
Esempio n. 3
0
def construct_tree(matrix, nj=True):
    """Build a tree from a distance matrix.

    Can either use neighbor-joining (nj) or UPGMA.

    ``matrix`` must be a non-empty list (rows of a lower-triangular distance
    matrix, as expected by ``_DistanceMatrix``); taxa are named "0", "1", ...
    by row position. If ``matrix`` is not a non-empty list, a message is
    printed and ``None`` is returned.

    Returns the constructed tree with all inner-node names blanked.
    """
    # isinstance() also accepts list subclasses, and truthiness already
    # covers the "empty list" case, so no separate len() test is needed.
    # Non-list inputs (e.g. arrays) are rejected before their truth value
    # is ever evaluated.
    if not (isinstance(matrix, list) and matrix):
        # Call form prints the same text under Python 2 and Python 3.
        print("matrix has invalid value")
        return None

    names = [str(i) for i in range(len(matrix))]
    dm = _DistanceMatrix(names=names, matrix=matrix)

    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm) if nj else constructor.upgma(dm)

    # this will remove the names from the inner nodes
    # this is critical for seq-gen to read in the tree
    for clade in tree.get_nonterminals():
        clade.name = ''

    return tree
 def test_good_construction(self):
     # Construction from well-formed fixtures: check the type first, then
     # the stored names/values, the length and the repr.
     dm = _DistanceMatrix(self.names, self.matrix)
     self.assertIsInstance(dm, TreeConstruction._DistanceMatrix)
     self.assertEqual('Alpha', dm.names[0])
     third_row = dm.matrix[2]
     self.assertEqual(3, third_row[1])
     self.assertEqual(4, len(dm))
     self.assertEqual(
         "_DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], "
         "matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]])",
         repr(dm))
Esempio n. 5
0
def calcTree(ensemble, distance_matrix):
    """Given a distance matrix for an ensemble, create and return a
    neighbor-joining tree.

    :arg ensemble: an ensemble with labels (read through ``getLabels()``).
    :type ensemble: prody.ensemble.Ensemble or prody.ensemble.PDBEnsemble
    :arg distance_matrix: a square matrix whose side equals the number of
        labels in *ensemble*. A :exc:`ValueError` is raised if the sizes do
        not match.
    :type distance_matrix: numpy.ndarray
    :returns: the tree built by Biopython's ``DistanceTreeConstructor.nj``,
        with the names of all non-terminal nodes set to **None**.
    :raises ImportError: if Biopython's ``Phylo`` module cannot be imported.
    """
    try:
        from Bio import Phylo
    except ImportError:
        raise ImportError('Phylo module could not be imported. '
                          'Reinstall ProDy or install Biopython '
                          'to solve the problem.')

    names = ensemble.getLabels()
    if len(names) != distance_matrix.shape[0] or len(
            names) != distance_matrix.shape[1]:
        raise ValueError("The size of matrix and ensemble has a mismatch.")

    # Biopython wants the lower triangle including the diagonal: row k keeps
    # its first k entries (1-based).
    matrix = [list(row[:k]) for k, row in enumerate(distance_matrix, start=1)]

    # Importing the submodule also makes Phylo.TreeConstruction resolvable.
    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    dm = _DistanceMatrix(names, matrix)
    constructor = Phylo.TreeConstruction.DistanceTreeConstructor()
    tree = constructor.nj(dm)
    for node in tree.get_nonterminals():
        node.name = None
    return tree
Esempio n. 6
0
    def distance_matrix(cls, cluster_list):
        """Return a ladderized neighbour-joining tree, as Newick text, for
        the accessions in ``cluster_list``.

        Pairwise distances are read from ``Distance`` rows whose two
        accession fields are both in ``cluster_list``; a pair may be stored
        in either orientation. The diagonal is filled with 0.

        Raises ValueError when no stored distance exists for some pair.
        """
        # Call form prints the same text under Python 2 and Python 3.
        print(cluster_list)
        dists = Distance.objects.filter(rep_accnum1__in=cluster_list, rep_accnum2__in=cluster_list)

        distance_pairs = {g.rep_accnum1 + '_' + g.rep_accnum2: g.distance for g in dists.all()}

        # Lower triangle including the diagonal, one row per accession.
        matrix = []
        for i in range(len(cluster_list)):
            row = []
            for j in range(i):
                forward = cluster_list[i] + '_' + cluster_list[j]
                backward = cluster_list[j] + '_' + cluster_list[i]
                if forward in distance_pairs:
                    row.append(distance_pairs[forward])
                elif backward in distance_pairs:
                    row.append(distance_pairs[backward])
                else:
                    # Raising a bare string is itself a TypeError; raise a
                    # real exception that names the missing pair.
                    raise ValueError("Error, can't find pair! (%s, %s)"
                                     % (cluster_list[i], cluster_list[j]))
            row.append(0)
            matrix.append(row)

        cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
        matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
        constructor = DistanceTreeConstructor()
        tree = constructor.nj(matrix_obj)
        tree.ladderize()

        output = StringIO.StringIO()
        Phylo.write(tree, output, 'newick')
        tree_str = output.getvalue()

        return tree_str
Esempio n. 7
0
def nj_wordlist(
        wordlist,
        column="Value",
        method=DistanceTreeConstructor.nj):
    """Create a tree using Hamming distances.

    `wordlist` is read as a tab-separated table and grouped by
    "Language_ID". For every language the set of values found in `column`
    is collected, and the distance between two languages is the size of the
    symmetric difference of their sets. `method` is an unbound
    DistanceTreeConstructor method (neighbor joining by default, or UPGMA)
    that is applied to the resulting distance matrix.

    """
    table = pandas.read_csv(wordlist, sep="\t")

    languages = []
    value_sets = []
    for language, rows in table.groupby("Language_ID"):
        languages.append(language)
        value_sets.append(set(rows[column]))

    # Lower triangle, diagonal included.
    triangle = []
    for i, current in enumerate(value_sets):
        triangle.append(
            [len(current ^ value_sets[j]) for j in range(i + 1)])

    distances = _DistanceMatrix(languages, triangle)
    builder = DistanceTreeConstructor()
    return method(builder, distances)
Esempio n. 8
0
 def test_good_manipulation(self):
     # Exercises valid item access, assignment, deletion and insertion on
     # one matrix instance; each step relies on the state left by the
     # previous one, so the order of statements matters.
     dm = _DistanceMatrix(self.names, self.matrix)
     # getitem: a single int or name yields the full row of distances
     self.assertEqual(dm[1], [1, 0, 3, 5])
     # pair access gives the same value in either order and either syntax
     self.assertEqual(dm[2, 1], 3)
     self.assertEqual(dm[2][1], 3)
     self.assertEqual(dm[1, 2], 3)
     self.assertEqual(dm[1][2], 3)
     self.assertEqual(dm['Alpha'], [0, 1, 2, 4])
     self.assertEqual(dm['Gamma', 'Delta'], 6)
     # setitem: replace all of Alpha's distances at once
     dm['Alpha'] = [0, 10, 20, 40]
     self.assertEqual(dm['Alpha'], [0, 10, 20, 40])
     # delitem insert item
     # deleting index 1 removes 'Beta' from both names and matrix
     del dm[1]
     self.assertEqual(dm.names, ['Alpha', 'Gamma', 'Delta'])
     self.assertEqual(dm.matrix, [[0], [20, 0], [40, 6, 0]])
     # re-insert 'Beta' at position 1; the Alpha-Beta distance is now 1
     dm.insert('Beta', [1, 0, 3, 5], 1)
     self.assertEqual(dm.names, self.names)
     self.assertEqual(dm.matrix, [[0], [1, 0], [20, 3, 0], [40, 5, 6, 0]])
     # deletion by name
     del dm['Alpha']
     self.assertEqual(dm.names, ['Beta', 'Gamma', 'Delta'])
     self.assertEqual(dm.matrix, [[0], [3, 0], [5, 6, 0]])
     # insert without an index appends at the end
     dm.insert('Alpha', [1, 2, 4, 0])
     self.assertEqual(dm.names, ['Beta', 'Gamma', 'Delta', 'Alpha'])
     self.assertEqual(dm.matrix, [[0], [3, 0], [5, 6, 0], [1, 2, 4, 0]])
Esempio n. 9
0
def measure_D_net(G,qmod,qcon):
    D_net_dic = {}
    D_net_ret = {}
    D_net = []
    for u in G: D_net_dic[u] = {}

    for u in sorted(G):
        key1 = "Taxon" + str(u)
        tmp_row = []
        for v in sorted(G):
            key2 = "Taxon" + str(v)
            if u < v: continue
            D_net_dic[u][v] = 1.0 - G.dmc_likelihood(u,v,qmod,qcon)
            tmp_row.append(D_net_dic[u][v])

            print D_net_dic[u][v],
        D_net.append(tmp_row)
        print '\n'


    names = []
    for u in G: names.append('Taxon'+str(u))
    print names 
    print D_net
    D_net_final = _DistanceMatrix(names,D_net)
    #print D_net_final.names 

    constructor = DistanceTreeConstructor()
    tree_dmc = constructor.upgma(D_net_final)
    #print tree_dmc
    Phylo.write(tree_dmc,'ph_dmc.nre','newick')
    
    return D_net_final
 def test_good_manipulation(self):
     # Walks a single matrix through lookups, a row assignment, two
     # deletions and two insertions, asserting names and stored values
     # after each mutation. Later assertions depend on earlier mutations.
     dm = _DistanceMatrix(self.names, self.matrix)
     # getitem: int index -> full row; (i, j) and [i][j] agree both ways
     self.assertEqual(dm[1], [1, 0, 3, 5])
     self.assertEqual(dm[2, 1], 3)
     self.assertEqual(dm[2][1], 3)
     self.assertEqual(dm[1, 2], 3)
     self.assertEqual(dm[1][2], 3)
     # names work as keys too
     self.assertEqual(dm['Alpha'], [0, 1, 2, 4])
     self.assertEqual(dm['Gamma', 'Delta'], 6)
     # setitem: overwrite the whole 'Alpha' row
     dm['Alpha'] = [0, 10, 20, 40]
     self.assertEqual(dm['Alpha'], [0, 10, 20, 40])
     # delitem insert item
     # remove the element at index 1 ('Beta')
     del dm[1]
     self.assertEqual(dm.names, ['Alpha', 'Gamma', 'Delta'])
     self.assertEqual(dm.matrix, [[0], [20, 0], [40, 6, 0]])
     # put 'Beta' back at index 1 with a new row of distances
     dm.insert('Beta', [1, 0, 3, 5], 1)
     self.assertEqual(dm.names, self.names)
     self.assertEqual(dm.matrix, [[0], [1, 0], [20, 3, 0], [40, 5, 6, 0]])
     # remove by name
     del dm['Alpha']
     self.assertEqual(dm.names, ['Beta', 'Gamma', 'Delta'])
     self.assertEqual(dm.matrix, [[0], [3, 0], [5, 6, 0]])
     # no index given: the new element ends up last
     dm.insert('Alpha', [1, 2, 4, 0])
     self.assertEqual(dm.names, ['Beta', 'Gamma', 'Delta', 'Alpha'])
     self.assertEqual(dm.matrix, [[0], [3, 0], [5, 6, 0], [1, 2, 4, 0]])
Esempio n. 11
0
def dm_to_tree(dm):
    """Convert a square distance table into a ``_DistanceMatrix``.

    ``dm`` is cast to float, and its lower triangle (diagonal included) is
    extracted row by row; the column labels, converted to str, become the
    taxon names. Presumably ``dm`` is a pandas DataFrame, since it uses
    ``.astype``, ``.values`` and ``.columns`` -- TODO confirm.

    NOTE(review): as written here the function returns nothing, and a
    failing construction is only reported on stdout, not re-raised. The
    name suggests a tree-building step follows; this snippet looks
    truncated -- confirm against the full source.
    """
    dm = dm.astype(float)
    # Row i keeps its first i + 1 values: lower triangle with diagonal.
    distance_triangular = [list(dm.values[i, : i + 1]) for i in range(len(dm))]
    try:
        dm = _DistanceMatrix(names=[str(i) for i in dm.columns], matrix=distance_triangular)
    except Exception, e:
        # Diagnostics: dm is still the input table here because the
        # assignment above did not complete.
        print list(dm.columns)
        print [type(i) for i in dm.columns]
Esempio n. 12
0
def construct_tree(X_2d, acc, title):
    """Write a neighbour-joining tree of the rows of ``X_2d`` to disk.

    Pairwise distances between the rows of ``X_2d`` are computed with
    ``pairwise_distances``, NaN distances are replaced by 0, and the lower
    triangle (diagonal included) is handed to Biopython with ``acc`` as the
    taxon names. The tree is saved in Newick format as ``title + ".nwk"``;
    nothing is returned.
    """
    labels = list(acc)

    dist = pairwise_distances(X_2d).astype('float')
    dist[np.isnan(dist)] = 0

    n_rows = dist.shape[0]
    lower_triangle = [
        [dist[row, col] for col in range(row + 1)]
        for row in range(n_rows)
    ]

    dm = _DistanceMatrix(labels, matrix=lower_triangle)
    nj_tree = DistanceTreeConstructor().nj(dm)
    Phylo.write(nj_tree, title + ".nwk", 'newick')
Esempio n. 13
0
def dm_to_tree(dm):
    """Convert a square distance table into a ``_DistanceMatrix``.

    ``dm`` is cast to float and its lower triangle (diagonal included) is
    extracted row by row; the column labels, converted to str, become the
    taxon names. Presumably ``dm`` is a pandas DataFrame, since it uses
    ``.astype``, ``.values`` and ``.columns`` -- TODO confirm.

    If the matrix cannot be constructed, diagnostic information about the
    labels and the extracted values is printed and the original exception
    is re-raised with its traceback intact.

    NOTE(review): nothing is returned after the conversion; the name
    suggests a tree-building step that is not part of this snippet.
    """
    dm = dm.astype(float)
    distance_triangular = [list(dm.values[i, :i + 1]) for i in range(len(dm))]
    try:
        dm = _DistanceMatrix(names=[str(i) for i in dm.columns],
                             matrix=distance_triangular)
    except Exception:
        # dm is still the input table here (the assignment did not finish).
        # The call form of print gives the same output on Python 2 and 3.
        print(list(dm.columns))
        print([type(i) for i in dm.columns])
        print(type(distance_triangular))
        print(type(distance_triangular[0]))
        print(set([str(type(i)) for j in distance_triangular for i in j]))
        print(distance_triangular)
        # A bare raise keeps the original traceback; "raise e" restarts it
        # from this line on Python 2.
        raise
Esempio n. 14
0
def lower_tri(full_matrix):
    '''
    Turn a labelled symmetric matrix into a lower-triangle _DistanceMatrix.

    Each row of full_matrix holds its label in position 0, followed by the
    distances. Row r (0-based) contributes its first r + 1 distances,
    converted to float.
    '''
    labels = []
    triangle = []
    for row_no, record in enumerate(full_matrix):
        # positions 1 .. row_no + 1 are the entries up to the diagonal
        triangle.append([float(cell) for cell in record[1:row_no + 2]])
        labels.append(record[0])
    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    return _DistanceMatrix(labels, triangle)
Esempio n. 15
0
def lower_tri(full_matrix):
    '''
    Build a lower-triangle _DistanceMatrix from a symmetrical matrix whose
    rows start with the row name.

    The first column supplies the names; from the remaining columns each
    row keeps one more value than the row before it (the diagonal is
    included), and every kept value is converted to float.
    '''
    names = []
    lower_triangle = []
    # "stop" is the exclusive end of the slice that reaches the diagonal.
    for stop, entry in enumerate(full_matrix, start=2):
        kept = entry[1:stop]
        lower_triangle.append(list(map(float, kept)))
        names.append(entry[0])
    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    result = _DistanceMatrix(names, lower_triangle)
    return result
Esempio n. 16
0
def construct_tree(align, ssr_regions, motifs, weights=(1, 0.1)):
    """
    Construct an upgma tree based on a pairwise Levenshtein distance matrix.
    For each pairwise comparison, the Levenshtein distances are calculated
    for sequences of non-SSR and SSR regions separately, and the weighted
    sum of them is used as the distance to construct an upgma tree. By
    default, weights for non-SSR and SSR regions are 1 and 0.1,
    respectively. In SSR regions, one repeat difference is considered to be
    one edit distance (each motif occurrence is collapsed to a single "x"
    after gaps are removed).

    Parameters
    ----------
    align: Bio.AlignIO.MultipleSeqAlignment
        input sequence alignment
    ssr_regions: list of tuple
        start and end positions of SSR regions in the alignment
    motifs: list
        repeat motifs, one per entry of ``ssr_regions``
    weights: sequence of two numbers
        weights for non-SSR and SSR regions to calculate pairwise distances
        (default: (1, 0.1))

    Returns
    -------
    The tree produced by ``DistanceTreeConstructor.upgma``; sequences are
    named "seq0", "seq1", ... in alignment order.
    """
    # The SSR columns depend only on ssr_regions, so compute them once
    # instead of once per sequence.
    ssr_columns = set(
        chain.from_iterable(range(*region) for region in ssr_regions))

    non_ssr_seqs = []
    ssr_seqs = []
    for record in align:
        seq = str(record.seq.upper())

        # Walk the columns in alignment order. Converting a set difference
        # to a list does not guarantee ascending order, which could
        # scramble the non-SSR sequence.
        non_ssr_seqs.append(
            "".join(base for col, base in enumerate(seq)
                    if col not in ssr_columns))

        ssr_seq = ""
        for rr, mot in zip(ssr_regions, motifs):
            ssr_seq += seq[rr[0]:rr[1]].replace("-", "").replace(mot, "x")
        ssr_seqs.append(ssr_seq)

    mat1 = pairwise_dist_Levenstein(non_ssr_seqs)
    mat2 = pairwise_dist_Levenstein(ssr_seqs)

    # Row-wise weighted sum of the two distance matrices.
    mat = [
        list(np.array(i) * weights[0] + np.array(j) * weights[1])
        for i, j in zip(mat1, mat2)
    ]
    names = ["seq{}".format(i) for i in range(len(align))]
    dmat = _DistanceMatrix(names, mat)

    constructor = DistanceTreeConstructor()
    return constructor.upgma(dmat)
Esempio n. 17
0
def create_distance_matrix(strainList, strainDict):
    """Return a ``_DistanceMatrix`` of pairwise distances between strains.

    ``strainList`` supplies the names (and the row order); ``strainDict``
    maps each name to its gene collection. Every pair, the diagonal
    included, is scored with ``calc_dist`` from the two collection sizes
    and the size of their intersection (``&``) -- presumably the
    collections are sets; verify against the caller.

    Progress messages are printed before and after the computation.
    """
    # Call form prints the same text under Python 2 and Python 3.
    print("Calculating distance matrix")

    # Zero-filled lower triangle: row i has i + 1 entries.
    matrix = [[0] * width for width in range(1, len(strainList) + 1)]
    dm = _DistanceMatrix(strainList, matrix)

    for a in range(len(strainList)):
        strA = strainList[a]
        genA = strainDict[strA]  # invariant for the inner loop
        for b in range(a, len(strainList)):
            strB = strainList[b]
            genB = strainDict[strB]
            dm[strA, strB] = calc_dist(len(genA), len(genB), len(genA & genB))

    print("Done calculating distance matrix")
    return dm
def create_distance_matrix(strainList, strainDict):
    """Compute the pairwise strain distances as a ``_DistanceMatrix``.

    A zero-filled lower-triangular matrix named after ``strainList`` is
    created first; then every pair of strains (each strain with itself
    included) is assigned ``calc_dist(len(A), len(B), len(A & B))``, where
    A and B are the entries of ``strainDict`` for the two strains
    (presumably sets of genes -- TODO confirm).

    A message is printed when the computation starts and when it ends.
    """
    # Parenthesised single-argument print behaves identically on Python 2
    # and is valid on Python 3.
    print("Calculating distance matrix")

    n_strains = len(strainList)
    matrix = []
    for row_len in range(1, n_strains + 1):
        matrix.append([0] * row_len)
    dm = _DistanceMatrix(strainList, matrix)

    for a in range(n_strains):
        strA = strainList[a]
        genA = strainDict[strA]  # looked up once per outer iteration
        for b in range(a, n_strains):
            strB = strainList[b]
            genB = strainDict[strB]
            shared = len(genA & genB)
            dm[strA, strB] = calc_dist(len(genA), len(genB), shared)

    print("Done calculating distance matrix")
    return dm
Esempio n. 19
0
def build_guide_trees(distance_matrix):
    # Builds a neighbour-joining guide tree from a square distance matrix
    # and writes it to "njtree.dnd" in Newick format. Sequences are named
    # "S0", "S1", ... by row position. Progress and timing are printed.
    #
    # distance_matrix is indexed as [i, :i + 1] and the slice is converted
    # with .tolist() -- presumably a 2-D numpy array; TODO confirm.
    #
    # NOTE(review): the function body continues beyond what is visible in
    # this snippet; only the neighbour-joining part is documented here.

    # build distance matrix biopython object
    # (lower triangle with diagonal: row i keeps its first i + 1 values)
    matrix = [distance_matrix[i, :i + 1].tolist() for i in range(len(distance_matrix))]
    names = ['S' + str(i) for i in range(len(distance_matrix))]
    dm = _DistanceMatrix(names, matrix)
    print('Constructed matrix')
    constructor = DistanceTreeConstructor()

    # construct neighbour joining tree
    t = time.time()  # wall-clock start for the timing message below
    tree = constructor.nj(dm)
    print('Constructed nj tree in {:.4f}s'.format(time.time() - t))
    Phylo.write(tree, "njtree.dnd", "newick")
    # Post-processes the file just written; helper is defined elsewhere
    # (judging by its name it strips inner-node labels -- verify).
    remove_inner_nodes_tree("njtree.dnd")

    """
Esempio n. 20
0
    def test_correct_res(self):
        # Fit the tree under test on the distance table stored in the CSV.
        table = pd.read_csv("data/wiki_tree.csv", index_col=0)
        self.tree.set_distance_matrix(table)
        self.tree.fit()

        # Reference result: Biopython's neighbour joining on a hard-coded
        # five-taxon lower-triangular matrix.
        reference_dm = _DistanceMatrix(
            names=['a', 'b', 'c', 'd', 'e'],
            matrix=[[0], [5, 0], [9, 10, 0], [9, 10, 8, 0], [8, 9, 7, 3, 0]])
        reference_tree = DistanceTreeConstructor().nj(reference_dm)

        # Both trees must have the same shape when viewed as undirected
        # graphs.
        expected_graph = Phylo.to_networkx(reference_tree).to_undirected()
        actual_graph = Phylo.to_networkx(self.tree.get_tree()).to_undirected()
        self.assertTrue(is_isomorphic(expected_graph, actual_graph))
Esempio n. 21
0
def _parse_mash_matrix(mash_matrix):
    """Split tab-separated matrix text into (names, lower-triangle rows).

    The first non-blank line is the header; its non-empty fields are the
    names. Every later non-blank line is a data row whose field 0 is
    skipped; the r-th data row (0-based) contributes fields 1 .. r + 1 as
    floats, i.e. the lower triangle including the diagonal.
    """
    names = []
    matrix = []
    header_seen = False
    for line in mash_matrix.split("\n"):
        if line.strip() == "":
            continue
        fields = line.split("\t")
        if not header_seen:
            names.extend(field for field in fields if len(field) > 0)
            header_seen = True
        else:
            # Explicit indexing keeps the IndexError for rows that are too
            # short (a slice would silently truncate them).
            matrix.append(
                [float(fields[q]) for q in range(1, len(matrix) + 2)])
    return names, matrix


def main():
    """Command-line entry point: build a neighbour-joining tree from mash
    distances.

    Reads a list of assembly paths (one per line) from the file given with
    -i/--input, obtains the distance-matrix text from ``make_mash_matrix``
    (which works inside a temporary directory), builds the tree and writes
    it in Newick format to "my_tree_<digits>.tree", where the digits are
    the fractional part of the current timestamp.
    """
    import shutil

    ### Main Arg Parse ###
    parser = argparse.ArgumentParser(description="Fast tree builder v1")
    # Without an input file the program cannot do anything; let argparse
    # report that instead of failing later inside open(None).
    parser.add_argument('-i','--input',help="Input list of assembly paths",
                        required=True)
    parser.add_argument('-t','--threads',help="Threads for mash)",default="1")
    args = vars(parser.parse_args())

    genome_list = args["input"] # List of assembly paths
    with open(genome_list) as f:
        input_data = f.read().strip().split("\n")
    threads = args["threads"]

    # The scratch directory is only needed while the mash matrix is being
    # produced; remove it afterwards so that runs do not leak directories.
    temp_dir = tempfile.mkdtemp()
    try:
        mash_matrix = make_mash_matrix(input_data,temp_dir,threads)
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    names, matrix = _parse_mash_matrix(mash_matrix)

    print("building tree")
    dm = _DistanceMatrix(names,matrix)
    constructor = DistanceTreeConstructor(method="nj")
    tree = constructor.nj(dm)
    unique_time = str(time.time()).split(".")[1]
    Phylo.write([tree],"my_tree_{}.tree".format(unique_time),"newick")
Esempio n. 22
0
def calcTree(names, distance_matrix, method='nj'):
    """Build a tree from a square distance matrix and return it.

    :arg names: labels of the tree leaves, one per matrix row
    :type names: list, :class:`~numpy.ndarray`

    :arg distance_matrix: a square matrix whose side equals the length of
                          *names*; a :exc:`ValueError` is raised otherwise
    :type distance_matrix: :class:`~numpy.ndarray`

    :arg method: tree construction method, "nj" or "upgma"; surrounding
                 whitespace and letter case are ignored
    :type method: str

    The names of all non-terminal nodes of the returned tree are set to
    **None**.
    """
    try:
        from Bio import Phylo
    except ImportError:
        raise ImportError('Phylo module could not be imported. '
                          'Reinstall ProDy or install Biopython '
                          'to solve the problem.')

    n_names = len(names)
    if not (n_names == distance_matrix.shape[0]
            and n_names == distance_matrix.shape[1]):
        raise ValueError("Mismatch between the sizes of matrix and names.")

    # Lower triangle with diagonal: the k-th row (1-based) keeps k values.
    lower_triangle = [
        list(row[:k]) for k, row in enumerate(distance_matrix, start=1)
    ]

    from Bio.Phylo.TreeConstruction import _DistanceMatrix
    if isinstance(names, np.ndarray):
        names = names.tolist()
    dm = _DistanceMatrix(names, lower_triangle)
    constructor = Phylo.TreeConstruction.DistanceTreeConstructor()

    builders = {'nj': constructor.nj, 'upgma': constructor.upgma}
    chosen = method.strip().lower()
    if chosen not in builders:
        raise ValueError('Method can be only either "nj" or "upgma".')
    tree = builders[chosen](dm)

    for inner in tree.get_nonterminals():
        inner.name = None
    return tree
Esempio n. 23
0
def bootstrap(ps,
              jobID,
              basename='majorityTree',
              treebuilder='nj',
              bootstraps=1,
              outgroup=None):
    """Build ``bootstraps`` resampled trees and pickle the list of trees.

    treebuilder could be nj/upgma, outgroup: a population name or 'midpoint'

    The union of ``allpolySites`` over all populations in ``ps`` is
    resampled with replacement in every round; the per-population bootstrap
    rows are turned into a distance matrix with ``neiDF`` and a tree is
    built with the ``DistanceTreeConstructor`` method named by
    ``treebuilder``. The tree is optionally rooted at its midpoint or with
    the named outgroup.

    The trees are written with pickle to
    '../Data/<basename>_<bootstraps>_<treebuilder>_<#populations>_<jobID>.pcl'
    (jobID zero-padded to four digits). Nothing is returned.

    Errors raised while constructing the distance matrix propagate to the
    caller.
    """
    allLoci = set()
    for pop in ps.populations:
        allLoci = allLoci.union(pop.allpolySites)
    allLoci = list(
        allLoci
    )  ## sort it? Reduce to independent sites (www.pnas.org/content/93/23/13429, run LD?)

    sites = len(allLoci)
    trees = []
    for _ in range(bootstraps):
        # Draw as many locus indices as there are loci, with replacement.
        selectedLoci0 = np.random.choice(range(len(allLoci)),
                                         sites,
                                         replace=True)
        selectedLoci = [allLoci[l] for l in selectedLoci0]
        df = pd.DataFrame(
            [pop.bootstrap(selectedLoci) for pop in ps.populations],
            index=ps.popnames)
        dmNei = neiDF(df, [5] * (sites - 1))
        ## annoying conversion, BioPython couldnt be just more compatible with scipy/pdist?
        dmTriangular = [list(dmNei[i, :(i + 1)]) for i in range(len(dmNei))]
        # No debugger trap here: a leftover pdb.set_trace() would hang
        # unattended jobs and then fail on an unbound name anyway.
        m = _DistanceMatrix(ps.popnames, dmTriangular)
        constructor = DistanceTreeConstructor(
        )  # could've passed treebuilder here too
        tree = getattr(constructor, treebuilder)(m)
        if outgroup == 'midpoint':
            tree.root_at_midpoint()
        elif outgroup is not None:
            tree.root_with_outgroup({'name': outgroup})
        trees.append(tree)

    filename = '../Data/%s_%s_%s_%s_%04d.pcl' % (
        basename, bootstraps, treebuilder, len(ps.populations), jobID)
    with open(filename, "wb") as w:
        pickle.dump(trees, w)
Esempio n. 24
0
    def bootstrap(self, afbased=True, basename='majorityTree', treebuilder='nj', bootstraps=1000, outgroup=None, useAllLoci=False):
        """Bootstrap trees over the variable loci and keep their majority consensus.

        treebuilder could be nj/upgma, outgroup: a population name or 'midpoint'.

        ``afbased`` selects which per-population site attribute is used
        ('allpolySites' when True, 'allpolySitesVCF' when False). In every
        round the loci are either all used (``useAllLoci``) or resampled
        with replacement; the resulting distance matrix is stored in
        ``self.dmNei`` and a tree is built with the constructor method
        named by ``treebuilder``. The majority consensus of all trees is
        stored in ``self.majorityTree``, written as Newick below
        ``resultDir`` and drawn as ASCII art. Progress is printed.

        NOTE(review): with ``bootstraps == 0`` the debug print after the
        loop reads ``selectedLoci`` before it was ever assigned.
        """
        ## allLoci: all loci that are variable in at least one population
        attribute_names = {True:  ['allpolySites', 'pwm'],
                           False: ['allpolySitesVCF', 'pwmVCF']}[afbased]
        site_attr = attribute_names[0]

        variable_sites = set()
        for population in self.populations:
            variable_sites = variable_sites.union(getattr(population, site_attr))
        allLoci = list(variable_sites) ## sort it? Reduce to independent sites (www.pnas.org/content/93/23/13429, run LD?)

        sites = len(allLoci)
        trees = []
        print ("Bootstrapping, rounds:", end=' ')
        for round_no in range(bootstraps): ## see also parallelized version
            print(round_no, end=' ')
            if useAllLoci:
                selectedLoci = allLoci
            else:
                drawn = np.random.choice(range(len(allLoci)), sites, replace=True)
                selectedLoci = [allLoci[position] for position in drawn]

            rows = [pop.bootstrap(selectedLoci, afbased) for pop in self.populations]
            df = pd.DataFrame(rows, index=self.popnames)
            self.dmNei = neiDF(df, [5]*(sites-1))
            ## annoying conversion, BioPython couldnt be just more compatible with scipy/pdist?
            dmTriangular = [list(self.dmNei[i, :(i + 1)]) for i in range(len(self.dmNei))]

            m = _DistanceMatrix(self.popnames, dmTriangular)
            constructor = DistanceTreeConstructor() # could've passed treebuilder here too
            build = getattr(constructor, treebuilder)
            tree = build(m)
            if outgroup == 'midpoint':
                tree.root_at_midpoint()
            elif outgroup is not None:
                tree.root_with_outgroup({'name': outgroup})
            trees.append(tree) ## use nj!

        ## debug info:
        print(f'selectedLoci: {selectedLoci[:30]}')
        ## see https://biopython.org/wiki/Phylo, turned out to be more suitable than dendropy/sumtrees
        self.majorityTree = Consensus.majority_consensus(trees) ## also consider strict_consensus and adam_consensus (but they don't have bootstrap support values)
        treefile = '%s/%s_%s_%s_%s.nwk' %(resultDir, basename, bootstraps,treebuilder, len(self.populations))
        Phylo.write(self.majorityTree, treefile, format='newick')
        print(f'wrote {treefile}')
        Phylo.draw_ascii(self.majorityTree)
0
def build_phylo_tree(file_name_pair_dist):  #to return total branch lenght
    file_pair_dist = file_name_pair_dist  #all.reciprocal
    pair_dist = commands.getoutput("cut -f1,2,3 " + file_pair_dist +
                                   " ")  #need to delete header of the table
    pair_dist = pair_dist.split('\n')
    list_genome = commands.getoutput("awk '{print $1}{print $2}' " +
                                     file_pair_dist + " |sort -g|uniq")
    list_genome = list_genome.split('\n')

    #print 'Total genomes >> ',len(list_genome)
    Total_genomes = len(list_genome)
    #print 'Total pair distance >> ',(len(list_genome)*(len(list_genome)-1))/2
    Total_pair_distance = (len(list_genome) * (len(list_genome) - 1)) / 2
    check1 = False
    if len(pair_dist) == (
            len(list_genome) *
        (len(list_genome) - 1)) / 2:  #To check the number of pair distance
        #print 'Pass check 1 : total pair distances are correctly found '
        check1 = True

    matrix = []
    for n in range(1, len(list_genome) + 1):
        matrix.append([0] * n)
    #print matrix
    Total_metrix = len(matrix)
    #print 'Total matrix >>',len(matrix)

    Ds = _DistanceMatrix(list_genome, matrix)

    for pair in pair_dist:
        i = pair.split()
        #print i
        Ds[i[0], i[1]] = float(i[2])
    #print Ds
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(Ds)
    #print (tree) #for visualization
    #Phylo.draw(tree, branch_labels=lambda c: c.branch_length)

    print 'total_branch_length >> ', tree.total_branch_length()
    return Total_genomes, Total_pair_distance, check1, tree.total_branch_length(
    )
Esempio n. 26
0
 def test_bad_manipulation(self):
     # Invalid lookups and assignments must raise the documented exception
     # type; the cases run in the order listed.
     dm = _DistanceMatrix(self.names, self.matrix)

     # getitem: unknown names, mixed or non-integer keys, out-of-range
     bad_lookups = [
         (ValueError, 'A'),
         (ValueError, ('Alpha', 'A')),
         (TypeError, (1, 'A')),
         (TypeError, (1, 1.2)),
         (IndexError, 6),
         (IndexError, (10, 10)),
     ]
     for expected_error, key in bad_lookups:
         self.assertRaises(expected_error, dm.__getitem__, key)

     bad_assignments = [
         # setitem: item or index test
         (ValueError, 'A', [1, 3, 4]),
         (ValueError, ('Alpha', 'A'), 4),
         (TypeError, (1, 'A'), 3),
         (TypeError, (1, 1.2), 2),
         (IndexError, 6, [1, 3, 4]),
         (IndexError, (10, 10), 1),
         # setitem: value test
         (ValueError, 0, [1, 2]),
         (TypeError, ('Alpha', 'Beta'), 'a'),
         (TypeError, 'Alpha', ['a', 'b', 'c']),
     ]
     for expected_error, key, value in bad_assignments:
         self.assertRaises(expected_error, dm.__setitem__, key, value)
 def test_bad_manipulation(self):
     # Each entry names the accessor to call, the exception it must raise
     # and the arguments to call it with. Order matches the original
     # sequence of checks.
     dm = _DistanceMatrix(self.names, self.matrix)
     cases = (
         # getitem
         ('__getitem__', ValueError, ('A',)),
         ('__getitem__', ValueError, (('Alpha', 'A'),)),
         ('__getitem__', TypeError, ((1, 'A'),)),
         ('__getitem__', TypeError, ((1, 1.2),)),
         ('__getitem__', IndexError, (6,)),
         ('__getitem__', IndexError, ((10, 10),)),
         # setitem: item or index test
         ('__setitem__', ValueError, ('A', [1, 3, 4])),
         ('__setitem__', ValueError, (('Alpha', 'A'), 4)),
         ('__setitem__', TypeError, ((1, 'A'), 3)),
         ('__setitem__', TypeError, ((1, 1.2), 2)),
         ('__setitem__', IndexError, (6, [1, 3, 4])),
         ('__setitem__', IndexError, ((10, 10), 1)),
         # setitem: value test
         ('__setitem__', ValueError, (0, [1, 2])),
         ('__setitem__', TypeError, (('Alpha', 'Beta'), 'a')),
         ('__setitem__', TypeError, ('Alpha', ['a', 'b', 'c'])),
     )
     for accessor, error, call_args in cases:
         self.assertRaises(error, getattr(dm, accessor), *call_args)
Esempio n. 28
0
    def distance_matrix(cls, cluster_list):
        """Build a ladderized neighbour-joining tree for ``cluster_list``
        and return it as Newick text.

        Distances come from ``Distance`` rows whose two accession fields
        are both members of ``cluster_list``. Each pair is looked up in
        both orientations; the diagonal is 0.

        Raises ValueError if a pair has no stored distance.
        """
        # Call form prints the same text under Python 2 and Python 3.
        print(cluster_list)
        dists = Distance.objects.filter(rep_accnum1__in=cluster_list,
                                        rep_accnum2__in=cluster_list)

        distance_pairs = {
            g.rep_accnum1 + '_' + g.rep_accnum2: g.distance
            for g in dists.all()
        }

        def lookup(first, second):
            # A pair may be stored as first_second or second_first.
            for key in (first + '_' + second, second + '_' + first):
                if key in distance_pairs:
                    return distance_pairs[key]
            # Raising a bare string is itself a TypeError; raise a real
            # exception that names the missing pair.
            raise ValueError("Error, can't find pair! (%s, %s)"
                             % (first, second))

        # Lower triangle including the diagonal.
        matrix = []
        for i in range(len(cluster_list)):
            matrix_iteration = [
                lookup(cluster_list[i], cluster_list[j]) for j in range(i)
            ]
            matrix_iteration.append(0)
            matrix.append(matrix_iteration)

        cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
        matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
        constructor = DistanceTreeConstructor()
        tree = constructor.nj(matrix_obj)
        tree.ladderize()

        output = StringIO.StringIO()
        Phylo.write(tree, output, 'newick')
        tree_str = output.getvalue()

        return tree_str
Esempio n. 29
0
def get_local_tree(chrom,
                   start,
                   end,
                   vcf_fn,
                   samples=None,
                   outgroup=None,
                   plot=False):
    """Build a neighbour-joining tree for a region of a VCF file.

    Pairwise differences are obtained from ``hap.get_pairwise_diff`` (with
    a chunk size of 30000); their lower triangle, named after the columns
    of the result, is used for neighbour joining. If ``outgroup`` is given
    the tree is rooted with it. The tree is ladderized and the names of its
    inner nodes are cleared.

    When ``plot`` is true, a copy of the tree with the outgroup pruned (if
    one was given) is drawn on a new 15x50 figure without showing it; the
    left, right and top spines, the y ticks and the y label are removed.

    Returns the pairwise-difference table and the (unpruned) tree.
    """
    pwd = hap.get_pairwise_diff(vcf_fn,
                                chrom=chrom,
                                start=start,
                                end=end,
                                samples=samples,
                                chunksize=30000)

    # Lower triangle with diagonal: row i keeps its first i + 1 values.
    distance_triangular = []
    for i in range(len(pwd)):
        distance_triangular.append(list(pwd.values[i, :i + 1]))

    dm = _DistanceMatrix(names=list(pwd.columns), matrix=distance_triangular)
    tree = DistanceTreeConstructor().nj(dm)

    has_outgroup = outgroup is not None
    if has_outgroup:
        tree.root_with_outgroup(outgroup)
    tree.ladderize()
    for inner in tree.get_nonterminals():
        inner.name = None

    # Copy used for drawing only; the returned tree keeps the outgroup.
    tree_no = copy.deepcopy(tree)
    if has_outgroup:
        tree_no.prune(outgroup)

    if plot:
        plt.figure(figsize=(15, 50))
        ax = plt.gca()

        Phylo.draw(tree_no, axes=ax, do_show=False)
        for side in ('left', 'right', 'top'):
            ax.spines[side].set_visible(False)
        ax.set_yticks([])
        ax.set_ylabel('')
    return pwd, tree
Esempio n. 30
0
def build_tree(dist_matrix, names_list, clust):
    """Build a ``Tree`` from a distance matrix with the chosen method.

    ``clust`` selects the algorithm:

    * 'nj'    -- neighbour joining through ``DistanceMatrix`` / ``nj``; the
                 string result is wrapped in ``Tree``.
    * 'upgma' -- Biopython UPGMA on ``condense_matrix(dist_matrix)``; the
                 inner-node names are removed before the tree is
                 serialised to Newick and wrapped in ``Tree``.

    Any other value prints an error message and terminates the program
    with exit status 1.
    """
    if clust == 'nj':
        dm = DistanceMatrix(dist_matrix, names_list)
        tree_scikit = nj(dm,result_constructor=str)
        return Tree(tree_scikit)

    if clust == 'upgma':
        dm = _DistanceMatrix(names=names_list, matrix=condense_matrix(dist_matrix))
        constructor = DistanceTreeConstructor()
        tree_biopython = constructor.upgma(dm)
        # remove InnerNode names
        for i in tree_biopython.get_nonterminals():
            i.name = None
        output = StringIO()
        Phylo.write(tree_biopython,output, "newick")
        return Tree(output.getvalue())

    print("Unknown tree clustering method ! Aborting")
    # A bare sys.exit() would report success (status 0) to the calling
    # process even though nothing was built.
    sys.exit(1)
Esempio n. 31
0
def ParseMatrix(filename):
    """Read a lower-triangular distance matrix file into a ``_DistanceMatrix``.

    Lines starting with '>' carry a name (the FASTA header without the
    '>'); every other non-empty line is one row of whitespace-separated
    float distances. Empty lines are ignored.

    The names are replaced by their 0-based position (as strings) in the
    returned matrix, and the position-to-name mapping is written, tab
    separated, to ``args.out + '.map'`` (``args`` is a module-level
    object).
    """
    mat_names = []  # FASTA headers
    lt_matrix = []  #lower triangular matrix
    # Plain 'r': the 'U' flag is rejected by Python 3.11+, and text mode
    # already translates newlines on Python 3. Stripping '\r' as well keeps
    # CRLF files working where no translation happens.
    with open(filename, 'r') as MAT:
        for line in MAT:
            line = line.rstrip('\r\n')
            if len(line) == 0:
                continue
            if line[0] == '>':
                mat_names.append(line[1:])
            else:
                lt_matrix.append([float(i) for i in line.split()])

    #Switch to integer headers and print Mapping file
    mat_names_num = []
    with open(args.out + '.map', 'w') as MAP:
        for index, name in enumerate(mat_names):
            MAP.write(str(index) + '\t' + name + '\n')
            mat_names_num.append(str(index))

    #Fill into Biopython distmat Data Structures
    dist_matrix = _DistanceMatrix(names=mat_names_num, matrix=lt_matrix)
    return dist_matrix
Esempio n. 32
0
def build_tree(dist_matrix, names_list, clust):
    """Cluster a distance matrix and return the result as a ``Tree``.

    * ``'nj'``    -- ``nj`` is run on ``DistanceMatrix(dist_matrix, names_list)``
      with ``result_constructor=str`` and its output is parsed by ``Tree``.
    * ``'upgma'`` -- ``DistanceTreeConstructor.upgma`` is run on the matrix
      returned by ``condense_matrix``; non-terminal clade names are cleared
      before the Newick text is parsed by ``Tree``.

    For any other ``clust`` a message is printed and ``sys.exit()`` is called.
    """
    if clust == 'nj':
        return Tree(nj(DistanceMatrix(dist_matrix, names_list),
                       result_constructor=str))

    if clust == 'upgma':
        lower_triangle = condense_matrix(dist_matrix)
        matrix_obj = _DistanceMatrix(names=names_list, matrix=lower_triangle)
        builder = DistanceTreeConstructor()
        clustered = builder.upgma(matrix_obj)
        # remove InnerNode names
        for inner in clustered.get_nonterminals():
            inner.name = None
        sink = StringIO()
        Phylo.write(clustered, sink, "newick")
        return Tree(sink.getvalue())

    print("Unknown tree clustering method ! Aborting")
    sys.exit()
Esempio n. 33
0
def D_F_matrix(D_Seq,D_net,final_tree):
    """Average two distance matrices and write the UPGMA tree of the result.

    Each entry of the combined matrix is
    ``0.5*D_net[a,b] + 0.5*D_Seq[a,b]``.  Only names present in both
    matrices are used, in the order of ``D_net.names``; this keeps the name
    list and the lower-triangular rows the same size even when ``D_Seq``
    lacks some of the ``D_net`` names.

    D_Seq, D_net -- matrices exposing ``.names`` and ``[name1, name2]`` lookup.
    final_tree   -- target passed to ``Phylo.write`` (written as newick).

    Returns the combined ``_DistanceMatrix``.
    """
    names_Seq = set(D_Seq.names)
    # Shared names, D_net order.
    D_F_names = [name for name in D_net.names if name in names_Seq]

    # Row i holds the distances from name i to names 0..i (diagonal included).
    D_F = []
    for i, key1 in enumerate(D_F_names):
        D_F.append([(0.5*D_net[key1,key2] + 0.5*D_Seq[key1,key2])
                    for key2 in D_F_names[:i+1]])

    print(D_F)

    D_F_final = _DistanceMatrix(D_F_names,D_F)

    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
Esempio n. 34
0
def D_F_matrix(D_Seq,D_net,final_tree, alpha):
    """Blend two distance matrices and write the UPGMA tree of the result.

    Each entry of the combined matrix is
    ``(1-alpha) * D_net[a,b] + alpha * D_Seq[a,b]``, so ``alpha`` chooses how
    much of ``D_Seq`` versus ``D_net`` is used (the original comments suggest
    a value between 0 and 1; this is not enforced).  Only names present in
    both matrices are used, in the order of ``D_net.names``; this keeps the
    name list and the lower-triangular rows the same size even when
    ``D_Seq`` lacks some of the ``D_net`` names.

    D_Seq, D_net -- matrices exposing ``.names`` and ``[name1, name2]`` lookup.
    final_tree   -- target passed to ``Phylo.write`` (written as newick).
    alpha        -- weight of ``D_Seq``; ``D_net`` gets ``1 - alpha``.

    Returns the combined ``_DistanceMatrix``.
    """
    names_Seq = set(D_Seq.names)
    # Shared names, D_net order.
    D_F_names = [name for name in D_net.names if name in names_Seq]

    # Row i holds the distances from name i to names 0..i (diagonal included).
    D_F = []
    for i, key1 in enumerate(D_F_names):
        D_F.append([((1-alpha) * D_net[key1,key2]) + (alpha * D_Seq[key1,key2])
                    for key2 in D_F_names[:i+1]])

    print(D_F)

    D_F_final = _DistanceMatrix(D_F_names,D_F)

    constructor = DistanceTreeConstructor()
    tree_D_F = constructor.upgma(D_F_final)
    Phylo.write(tree_D_F,final_tree,'newick')
    return D_F_final
Esempio n. 35
0
def _show_language_trees(upgma_tree, neighbor_tree, plot_upgma=False):
    '''Draw both trees as ASCII art on the console (optionally plot the UPGMA tree).'''
    print("upgma tree:\n")
    Phylo.draw_ascii(upgma_tree)
    if plot_upgma:
        Phylo.draw(upgma_tree)
    print("\nneighbor joining tree:\n")
    Phylo.draw_ascii(neighbor_tree)


def _save_language_trees(upgma_tree, neighbor_tree):
    '''Write both trees as ASCII art below reports/language_distances/.'''
    with open(r"reports/language_distances/upgma_tree.txt", 'w') as f:
        Phylo.draw_ascii(upgma_tree, f)
    with open(r"reports/language_distances/neighbor_tree.txt", 'w') as f:
        Phylo.draw_ascii(neighbor_tree, f)


def build_tree(languages, lang_dist):
    '''
    Builds a UPGMA and a neighbor-joining tree and prints them to a
    location chosen on standard input.

    languages -- list of language names, one per row of lang_dist.
    lang_dist -- list of distance rows; only the first i + 1 entries of
                 row i (lower triangle including the diagonal) are used.

    Options: 1 = console (also opens the Phylo.draw plot of the UPGMA
    tree), 2 = text files under reports/language_distances/, 3 = both.
    The answer must be a whole number; anything else is reported as an
    invalid option.  Neither argument is modified.
    '''
    print("""
    Where should the Tree be Printed:
    (1) Console
    (2) Text File
    (3) Both
    """)
    # Read the raw text and parse it as an integer.  Python 2's input()
    # evaluates whatever is typed, which would run arbitrary expressions.
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    try:
        user_in = int(read_line().strip())
    except ValueError:
        user_in = 0

    if user_in not in (1, 2, 3):
        print("That is not a valid option. Please choose a valid option\n")
        return

    # Work on copies so the caller's lists are left untouched (encoding the
    # names in place made a second call fail on non-ASCII names).
    # Names that are not already ``str`` (Python 2 unicode) are utf-8 encoded.
    names = [name if isinstance(name, str) else codecs.encode(name, 'utf-8')
             for name in languages]

    # get the lower triangle matrix format
    lower_triangle = [row[:i + 1] for i, row in enumerate(lang_dist)]

    dist_matrix = _DistanceMatrix(names, lower_triangle)

    tree_constructor = DistanceTreeConstructor()
    upgma_tree = tree_constructor.upgma(dist_matrix)
    neighbor_tree = tree_constructor.nj(dist_matrix)

    if upgma_tree is None or neighbor_tree is None:
        print("Run a comparison to generate the distance matrix first\n")
        return

    if user_in == 1:
        # Draw to the console
        _show_language_trees(upgma_tree, neighbor_tree, plot_upgma=True)
    elif user_in == 2:
        # draw to the files only
        _save_language_trees(upgma_tree, neighbor_tree)
    else:
        # draw to the files and the console
        _show_language_trees(upgma_tree, neighbor_tree)
        _save_language_trees(upgma_tree, neighbor_tree)
Esempio n. 36
0
from Bio import Phylo
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from io import StringIO
import re

# hamming distance
def hamming(seq1, seq2):
    """Return the number of positions at which seq1 and seq2 differ.

    The inputs are paired with ``zip``, so when the lengths differ the
    surplus of the longer input is ignored (no length check is made).
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        mismatches += left != right
    return int(mismatches)

# First line: species names.  Remaining lines: whitespace-separated rows that
# are transposed, so table[k] is the k-th column of those rows joined into
# one string.  The context manager closes the input file when done.
with open('rosalind_chbp.txt') as f:
    species = f.readline().rstrip().split()
    table = [''.join(i) for i in zip(*f.read().rstrip().split())]
n = len(table)

# For the Phylo.TreeConstruction to work, integers in the distance matrix
# must be Python int and not numpy.int64
dm = [[hamming(table[i], table[j]) for j in range(i+1)] for i in range(n)]
constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names=species, matrix=dm))

handle = StringIO()
Phylo.write(tree, handle, format='newick', plain=True)
result = handle.getvalue()
# Strip the generated "Inner<N>" labels from the newick text.
result = re.sub('Inner[0-9]+', '', result)
with open('rosalind_chbp_sub.txt', 'wt') as out:
    out.write(result)
        l += 1
        
# dist[a][b] = 1 - (third tab-separated column) for every line of the table.
dist = {}
with open(blasttable) as b:
    for ln in b.read().splitlines():
        s = ln.split("\t")
        dist.setdefault(s[0], {})[s[1]] = 1-float(s[2])

## Distance matrix
dist_score_assembly_line = generate_distance_matrix(pathways, domain_names, annotation,
                                                    dist, Jaccardw, GKw, DDSw, AIw,
                                                    scale=1, nbhood=3, outfile=outfile)

#-- Plot the tree
# _DistanceMatrix only accepts a real list of names; on Python 3 keys() is a view.
names = list(pathways.keys())
# Skip the first line of the score file and drop empty lines.
with open(outfile, 'r') as score_file:
    score = [s for s in score_file.read().split('\n')[1:] if s != '']
# Row i keeps comma-separated fields 1..i+1 (field 0 is skipped), giving a
# lower-triangular matrix including the diagonal.
matrix = []
for i, score_line in enumerate(score):
    matrix.append([float(value) for value in score_line.split(',')[1:(i+2)]])
m = _DistanceMatrix(names, matrix)
constructor = DistanceTreeConstructor()
tree1 = constructor.upgma(m)
#Bio.Phylo.draw_ascii(tree1)
Bio.Phylo.write(tree1, tree_outfile, 'newick')


Esempio n. 38
0
            if((i-1)==j):
                row.append(0)
            else:
                row.append(np.linalg.norm(scaled_ps[j]-scaled_ps[i-1]))
        dist_mat.append(row)
    return dist_mat

if __name__ == "__main__":
    # Collect the FASTA records, skipping those made up entirely of '$'.
    names = []
    sequences = []
    with open("kmercoded.fasta", "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            seq = record.seq
            if Counter(seq)['$'] == len(seq):
                continue
            names.append(record.description)
            sequences.append(seq)

    # Everything below works on in-memory data, so the file is already closed.
    pspec = power_spectrum(sequences)
    scaled_ps = linear_scaling(pspec) #cubic_scaling(pspec)
    dist_mat = get_euclidean_distance(scaled_ps)
    constructor = DistanceTreeConstructor()
    # NOTE(review): the original passed ``dist_arr`` here, a name that is not
    # defined in the visible code, while ``dist_mat`` was computed and never
    # used; this assumes ``dist_mat`` was intended -- confirm.
    # Only the first 10 records are clustered.
    distance_matrix_10 = _DistanceMatrix(names=names[0:10], matrix=dist_mat[0:10])
    tree_upgma_10 = constructor.upgma(distance_matrix_10)
    Phylo.draw(tree_upgma_10)
    tree_nj_10 = constructor.nj(distance_matrix_10)
    Phylo.draw(tree_nj_10)
    

Esempio n. 39
0
def compute_tree(options, mat, names):
    """ Run upgma hierarchical clustering on the pairwise values in mat and
    write the tree as newick, png and graphviz dot.

    mat is indexed as mat[name_a][name_b]; names fixes the row order.  All
    output paths derive from tree_path(options): the newick file itself and
    the same path with "newick" replaced by "png" / "dot".
    """
    def encode_distance(raw):
        # tree constructor writes 0-distances as 1s for some reason
        # so we hack around here: exact 0 and exact 1 get stand-in values
        dist = float(raw)
        if dist == 0.:
            return 1e-10
        if dist == 1.:
            return 1.1
        return dist

    # biopython wants a lower-triangular matrix, diagonal included
    matrix = [[encode_distance(mat[names[i]][names[j]]) for j in xrange(i + 1)]
              for i in xrange(len(names))]
    dm = _DistanceMatrix(names, matrix)

    # upgma tree
    tree = DistanceTreeConstructor().upgma(dm)
    robust_makedirs(os.path.dirname(tree_path(options)))
    Phylo.write(tree, tree_path(options), "newick")

    # png tree -- note : doesn't work in toil
    def blank_inner(label):
        return "" if "Inner" in str(label) else label
    Phylo.draw_graphviz(tree, label_func = blank_inner, node_size=1000, node_shape="s", font_size=10)
    pylab.savefig(tree_path(options).replace("newick", "png"))

    # graphviz: networkx graph -> undirected -> integer node ids, with the
    # original node objects kept under the "label" attribute
    nxgraph = nx.convert_node_labels_to_integers(
        nx.Graph(Phylo.to_networkx(tree)), label_attribute="label")

    for node_id in nxgraph.nodes():
        attrs = nxgraph.node[node_id]
        if "Inner" not in str(attrs["label"]):
            attrs["fontsize"] = 18
            continue
        # inner nodes: empty label, practically invisible
        attrs["label"] = "\"\""
        attrs["width"] = 0.001
        attrs["height"] = 0.001

    for edge_id in nxgraph.edges():
        attrs = nxgraph.edge[edge_id[0]][edge_id[1]]
        # in graphviz, weight means something else, so make it a label
        weight = float(attrs["weight"])
        # undo hack from above
        # NOTE(review): both tests are plain ``if``s, so a weight clamped to
        # 1. by the first one is then zeroed by the second -- confirm whether
        # ``elif`` was intended.
        if weight > 1:
            weight = 1.
        if weight <= 1e-10 or weight == 1.:
            weight = 0.
        attrs["weight"] = None
        attrs["label"] = "{0:.3g}".format(float(weight) * 100.)
        attrs["fontsize"] = 14
        attrs["len"] = draw_len(weight)
    nx.write_dot(nxgraph, tree_path(options).replace("newick", "dot"))
Esempio n. 40
0
def generate_tree_dendro(G_ref,H,outfile):
    length = len(H)
    length = length - 2
    '''

    tree = dendropy.Tree(stream=StringIO.StringIO(str(H[length])),schema="newick")
    for i in range(length-1,-1,-1):
        (u,v) = H[i]
        temp_tree = dendropy.Tree(stream=StringIO.StringIO(str(H[i])),schema="newick")
        current_parent = filter(lambda x: x.taxon.label == str(u), [y for y in tree.leaf_nodes()])
        temp_tree_nodes = [x for x in temp_tree.nodes()]
        current_parent[0].add_child(temp_tree_nodes[1])
        current_parent[0].add_child(temp_tree_nodes[2])


    print(tree.as_ascii_plot())
    '''
    (u,v) = H[length]
    counter = 1
    #[999]((Taxon1:0.3,Taxon2:0.14):0.5,(Taxon3:0.34, Taxon4:0.5):0.12);
    string_root = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")"
    tree = dendropy.Tree(stream=StringIO.StringIO(string_root),schema="newick")
    for i in range(length-1,-1,-1):
        (u,v) = H[i]
        string_temp = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")"
        temp_tree = dendropy.Tree(stream=StringIO.StringIO(string_temp),schema="newick")
        current_parent = filter(lambda x: x.taxon.label == "Taxon" + str(u), [y for y in tree.leaf_nodes()])
        temp_tree_nodes = [x for x in temp_tree.nodes()]
        current_parent[0].add_child(temp_tree_nodes[1])
        current_parent[0].add_child(temp_tree_nodes[2])
        current_parent[0].taxon.label = current_parent[0].oid

    #print tree 

    out = tree.as_string('newick')
    pdm = treecalc.PatristicDistanceMatrix(tree)
    T = [t1 for i, t1 in enumerate(tree.taxon_set)]
    T.sort(key=lambda x: x.label)

    out =  out.replace('[&U]','[50]') ## 50 unit long sequences
    open(outfile,'w').write(out)    # The first T produced by the history
    print(tree.as_ascii_plot())

    print "New tree data structure"

    D_net_dic = {}
    D_net_ret = {}
    D_net = []
    for u in G_ref: D_net_dic[u] = {}

    for u in sorted(G_ref):
        print "size"
        key1 = "Taxon" + str(u)
        tmp_row = []
        for v in sorted(G_ref):
            key2 = "Taxon" + str(v)
            if u < v: continue
            D_net_dic[u][v] = 1.0 / (pdm(T[int(u)-1],T[int(v)-1])+1)
            tmp_row.append(D_net_dic[u][v])

            print D_net_dic[u][v],
        D_net.append(tmp_row)
        print '\n'

    names = []
    for u in G_ref: names.append('Taxon'+str(u))
    print names 
    print D_net
    D_net_final = _DistanceMatrix(names,D_net)

    return D_net_final
Esempio n. 41
0
def dmc_delorean_plain(G,G_ref,qmod,qcon,outfile):
    """ Reconstructs the network using the dmc model and delorean algorithm. """

    # Make initial pairwise likelihoods.
    #target = open(hisfile,'w')

    

    global H1 
    L = {}
    for u in G: L[u] = {}

    for u in G:
        for v in G:
            if u >= v: continue
            L[u][v] = G.dmc_likelihood(u,v,qmod,qcon)

    level_counter = 0

    while (G.num_nodes >= 2): # at least two nodes in the graph.

        # Get largest Luv.
        L_list = []
        L_prob = -10000000000

        for u in G:
            for v in G:
                if u >= v: continue

                Luv = L[u][v]
                if Luv > L_prob:
                    L_list = [(u,v)]
                    L_prob = Luv
                elif Luv == L_prob:
                    L_list.append((u,v))

        # Choose random pair; assign random daddy.
        pair = random.choice(L_list)
        (u,v) = (pair[0],pair[1]) if random.random() > 0.5 else (pair[1],pair[0])

        # Nodes whose likelihood values need to be computed.
        altered = (G.neighbors(u) | G.neighbors(v) | set([u])) - set([v])

        # Prepare to delete v: add new edges in symmetric difference of v to u.
        for neighbor in G.neighbors(v):
            if u == neighbor: continue # Don't add self-edge.
            elif v == neighbor: continue # Don't add, will remove v anyways.
            elif G.has_edge(u,neighbor): continue # Edge already exists.
            else: G.add_edge(u,neighbor)
        G.remove_node(v)

        H1.append((u,v)) 
        print "%s\t%s" %(u,v)

        # Fix up altered Luv values.
        for x in altered:
            for y in G:
                if x == y: continue
                L[min(x,y)][max(x,y)] = G.dmc_likelihood(x,y,qmod,qcon)

    last_node = G.nodes()[0]
    H1.append((last_node,last_node))
    print "%s\t%s" %(last_node,last_node)


    length = len(H1)
    length = length - 2
    '''

    tree = dendropy.Tree(stream=StringIO.StringIO(str(H[length])),schema="newick")
    for i in range(length-1,-1,-1):
        (u,v) = H[i]
        temp_tree = dendropy.Tree(stream=StringIO.StringIO(str(H[i])),schema="newick")
        current_parent = filter(lambda x: x.taxon.label == str(u), [y for y in tree.leaf_nodes()])
        temp_tree_nodes = [x for x in temp_tree.nodes()]
        current_parent[0].add_child(temp_tree_nodes[1])
        current_parent[0].add_child(temp_tree_nodes[2])


    print(tree.as_ascii_plot())
    '''
    (u,v) = H1[length]
    counter = 1
    #[999]((Taxon1:0.3,Taxon2:0.14):0.5,(Taxon3:0.34, Taxon4:0.5):0.12);
    string_root = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")"
    tree = dendropy.Tree(stream=StringIO.StringIO(string_root),schema="newick")
    for i in range(length-1,-1,-1):
        (u,v) = H1[i]
        string_temp = "(" + "Taxon" + str(u) + ":" + str(1) + "," + "Taxon" + str(v) + ":" + str(1) + ")"
        temp_tree = dendropy.Tree(stream=StringIO.StringIO(string_temp),schema="newick")
        current_parent = filter(lambda x: x.taxon.label == "Taxon" + str(u), [y for y in tree.leaf_nodes()])
        temp_tree_nodes = [x for x in temp_tree.nodes()]
        current_parent[0].add_child(temp_tree_nodes[1])
        current_parent[0].add_child(temp_tree_nodes[2])
        current_parent[0].taxon.label = current_parent[0].oid

    #print tree 

    out = tree.as_string('newick')
    pdm = treecalc.PatristicDistanceMatrix(tree)
    T = [t1 for i, t1 in enumerate(tree.taxon_set)]
    T.sort(key=lambda x: x.label)
    #t = Tree(out.replace('[&U]',''))
    out =  out.replace('[&U]','[50]') ## 50 unit long sequences

    open(outfile,'w').write(out)    # The first T produced by the history
    print(tree.as_ascii_plot())
    
    print "New tree data structure"

    D_net_dic = {}
    D_net_ret = {}
    D_net = []
    for u in G_ref: D_net_dic[u] = {}

    for u in sorted(G_ref):
        print "size"
        key1 = "Taxon" + str(u)
        tmp_row = []
        for v in sorted(G_ref):
            key2 = "Taxon" + str(v)
            if u < v: continue
            D_net_dic[u][v] = 1.0 / (pdm(T[int(u)-1],T[int(v)-1])+1)
            tmp_row.append(D_net_dic[u][v])

            print D_net_dic[u][v],
        D_net.append(tmp_row)
        print '\n'

    names = []
    for u in G_ref: names.append('Taxon'+str(u))
    print names 
    print D_net
    D_net_final = _DistanceMatrix(names,D_net)

    return D_net_final
Esempio n. 42
0
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
	"""Render a neighbor-joining tree with abundance bubbles and export it as PhyloXML.

	x          -- flat list of abundances read in strides of len(org_names):
	              x[i] belongs to organism i, and x[i + k*len(org_names)]
	              (k = 1..9) to the hypothetical node of organism i at the
	              k-th cutoff.  It is normalized by its sum before use.
	org_names  -- organism names; duplicates are renamed IN PLACE by
	              appending "_1", "_2", ... to all but the last occurrence.
	ckm30, ckm50 -- square numpy arrays; each is scaled by 1/diag and the two
	              are averaged to obtain the distances.
	outgroup   -- name to root the tree on (a warning is printed if unknown).
	outfile    -- path of the rendered tree image.
	outfilexml -- path of the PhyloXML export.
	sum_x      -- value shown (first 8 characters) in the legend.
	"""
	#Make sure names are unique
	#(names is the same list object as org_names, so the renaming is visible through both)
	names = org_names
	for name in names:
		if names.count(name)>1:
			temp_name = name
			i=1
			for dummy in range(0,names.count(name)-1): #Don't change the last one, just to make sure we don't conflict with the outgroup
				names[names.index(temp_name)] = temp_name + "_" + str(i)
				i = i +1

	#Normalize the x vector (the sum is computed once; a real list is needed because x is indexed and sliced below)
	total_x = sum(x)
	x = [y/total_x for y in x]
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Make the list of distances (ave of the two ckm matrices)
	ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
	ckm_ave_train_dist = dict()
	for i in range(len(org_names)):
		ckm_ave_train_dist[org_names[i]] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	t=Tree(tree.format('newick'),format=1)
	#tree.format('newick')
	#Phylo.draw_ascii(tree)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
	def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
		dists = [abs(y-percent) for y in ckm_ave_train_dist[leaf_name]]
		nearby_indicies = list()
		#Add all the organisms that are within 0.05 of the given percent
	#	for i in range(len(dists)):
	#		if dists[i]<=.05:
	#			nearby_indicies.append(i)
		nearby_names = list()
		#If there are no nearby indicies, add the closest organism to the given percent
		if nearby_indicies==[]:
			nearby_names.append(org_names[dists.index(min(dists))])
		else:
			#Look up each stored index (only reachable if the block above is re-enabled)
			for idx in nearby_indicies:
				nearby_names.append(org_names[idx])
		mean_dist = np.mean([ckm_ave_train_dist[leaf_name][org_names.index(y)] for y in nearby_names])
		nearby_names.append(leaf_name)
		LCA = t.get_common_ancestor(nearby_names)
		LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
		#divide the dist to the right/left of the LCA node by the number of percentage points in there
		if LCA.name==t.name:
			percent_dist = percent*LCA_to_leaf_dist
			if mean_dist <= percent:
				child_node = (t&leaf_name)
			else:
				child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
			ancestor_node = (t&child_node.name).up
		elif mean_dist <= percent:
			percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		else:
			percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
			child_node = (t&leaf_name)
			ancestor_node = (t&child_node.name).up
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Set outgroup
	if outgroup in names:
		t.set_outgroup(t&outgroup) #I will need to check that this outgroup is actually one of the names...
	else:
		print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
		print(names)
		print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
	cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
	for i in range(len(org_names)):
		#xi[0] is the organism itself, xi[j] (j>=1) its value at cutoff j-1
		xi = x[i:len(x):len(org_names)]
		for j in range(1,len(cutoffs)+1):
			if xi[j]>0:
				insert_hyp_node(t, org_names[i], cutoffs[j-1],ckm_ave_train_dist, org_names)
				hyp_node_names[org_names[i]+"_"+str(cutoffs[j-1])] = [org_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names

	size_factor=250
	font_size=55

	#Now put the bubbles on the nodes
	def layout(node):
		node_style = NodeStyle()
		node_style["hz_line_width"] = 10
		node_style["vt_line_width"] = 10
		node.set_style(node_style)
		#print(node)
		if node.is_leaf():
			if node.name in org_names:
				#make reconstructed bubble
				size = x[org_names.index(node.name)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#Denote that this was a training organism
				nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in org_names:
				idx = hyp_node_names[node.name][2]
				size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
				F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				#This is if I want the names of the hypothetical nodes to be printed as well
				#nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
				#faces.add_face_to_node(nameFace, node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	#ts.mode = "c"
	ts.scale = 2*1000
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50
	F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
	F.border.width = None
	F.opacity = 0.6
	ts.legend.add_face(F,0)
	ts.legend.add_face(TextFace("  Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
	ts.legend.add_face(TextFace("  Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
	ts.legend_position=4
	#t.show(tree_style=ts)
	t.render(outfile, w=550, units="mm", tree_style=ts)

	#Render the XML file
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	project.add_phylogeny(phylo)
	#Close the export file once it has been written
	with open(outfilexml,'w') as xml_handle:
		project.export(xml_handle)
Esempio n. 43
0
# rosalind_ba7b
'''
Limb Length Problem

Find the limb length for a leaf in a tree.

Given: An integer n, followed by an integer j between 0 and n - 1, 
followed by a space-separated additive distance matrix D (whose elements are integers).

Return: The limb length of the leaf in Tree(D) corresponding to row j of this 
distance matrix (use 0-based indexing).

'''
import numpy as np
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

# Input: n, then the leaf index j, then the n x n distance matrix.
# The context manager closes the file once everything has been read.
with open('rosalind_ba7b.txt') as f:
    n = int(f.readline().rstrip())
    j = int(f.readline().rstrip())

    D = np.fromfile(f, sep=' ', dtype=int).reshape(n, n)

#For the Phylo.TreeConstruction to work, integers must be Python int and not numpy.int64
# (row/col are used as loop names so the leaf index j read above is never shadowed)
dm = [[int(D[row, col]) for col in range(row+1)] for row in range(n)]
names = [str(i) for i in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names, dm))

# Limb length = branch length of the leaf named after row j.
print(round(tree.find_any(str(j)).branch_length))
Esempio n. 44
0
# First whitespace-separated token of every line of taxas.txt is a taxon name.
# The context manager closes the file even if a line cannot be parsed.
taxas_ = []
with open('taxas.txt') as t_:
    for line in t_:
        taxas_.append(line.split()[0])

dist_ = np.genfromtxt('distance_.log')
dist1_ = np.tril(dist_)
# Make list of lists: row x keeps its first x + 1 entries (lower triangle
# including the diagonal).
dist2_ = []
for x in range(0, np.shape(dist1_)[0]):
    dist2_.append(list(dist1_[x][0:x + 1]))
mm_ = _DistanceMatrix(taxas_, dist2_)
root_ = DistanceTreeConstructor()
tree = root_.nj(mm_)

# Name the output after the directory containing this script; basename works
# with whatever path separator the platform uses.
path_ = os.path.dirname(os.path.realpath(__file__))
dir_ = os.path.basename(path_)
# The tree is written in newick format to "<dir>.nex".
Phylo.write(tree, str(dir_) + '.nex', 'newick')
############################################################################################################################
g_ = open('final.nex.tree', 'a')
with open(dir_ + '.p1.nex', 'r') as f_:
    for line in f_:
        g_.write(line)
g_.write('\nBEGIN TREES;\nTREE tree1 =')
with open(dir_ + '.nex', 'r') as f_:
    for line in f_:
Esempio n. 45
0
def construct_distance_matrix(names, bit_vectors):
	"""Wrap the pairwise values of bit_vectors in a ``_DistanceMatrix`` labelled by names.

	The matrix content is whatever ``get_similarity_from_bit_vectors`` returns
	for ``bit_vectors``.
	NOTE(review): that helper's name suggests similarities, yet the values are
	used as distances here -- confirm against the helper.
	"""
	pairwise_values = get_similarity_from_bit_vectors(bit_vectors)
	return _DistanceMatrix(names, pairwise_values)
Esempio n. 46
0
def _mean_min_jsd(blocks_from, blocks_to):
    """Average, over blocks_from, of each block's smallest JSD to any block of blocks_to."""
    minima = [min(JSD(src, dst) for dst in blocks_to) for src in blocks_from]
    return sum(minima) / len(minima)


def block_jsd(genome_ffp_vector, output_file,
              tree_file='nj_ffp_jsd_tree.newick'):
    """Build a block-wise JSD distance matrix between genomes and an NJ tree.

    Each non-blank line of ``genome_ffp_vector`` is ``<block name> <float> ...``;
    the genome name is the part of the block name between the first ``.`` and
    the following ``-``.  For every pair of genomes (A, B) the distance is the
    mean of two averages: each block of A against its closest block of B, and
    each block of B against its closest block of A.

    genome_ffp_vector -- path of the whitespace-separated input file.
    output_file       -- path of the tab-separated matrix written with
                         ``np.savetxt`` (header: genome names).
    tree_file         -- path of the newick tree (default keeps the original
                         fixed file name).

    The neighbor-joining tree is also drawn as ASCII art on stdout.
    """
    genomes = collections.defaultdict(list)
    with open(genome_ffp_vector, "r") as vector_file:
        for raw_line in vector_file:
            fields = raw_line.split()
            if not fields:
                continue  # tolerate blank lines
            genome_name = fields[0].split(".")[1].split("-")[0]
            # transform list of strings to floats (one array per block)
            genomes[genome_name].append(
                np.array([float(i) for i in fields[1:]]))

    # One fixed name order drives the matrix rows, the file header and the
    # tree labels.  It is a real list because _DistanceMatrix rejects key views.
    names = list(genomes)
    index_position = {genome: index for index, genome in enumerate(names)}

    # matrix for final jsd distance, one row/column per genome
    final_jsd_matrix = np.zeros(shape=(len(names), len(names)))

    # Pairwise comparison of every combination of 2 genomes.  The blocks are
    # compared directly instead of going through "<genome>_<block>-<genome>"
    # string keys, which mis-parsed genome names containing "_" and left
    # their distances at zero.
    for genome_pair1, genome_pair2 in combinations(names, 2):
        blocks_1 = genomes[genome_pair1]
        blocks_2 = genomes[genome_pair2]
        final_jsd = (_mean_min_jsd(blocks_1, blocks_2) +
                     _mean_min_jsd(blocks_2, blocks_1)) / 2
        row = index_position[genome_pair1]
        col = index_position[genome_pair2]
        final_jsd_matrix[row, col] = final_jsd
        final_jsd_matrix[col, row] = final_jsd

    # Write final results once, after all pairs are filled in
    np.savetxt(output_file,
               final_jsd_matrix,
               fmt="%.18e",
               delimiter="\t",
               newline="\n",
               header="\t".join(names),
               footer="",
               comments="")

    # convert np matrix to lower triangular list of lists (diagonal included)
    new_jsd_matrix = [row[:i + 1]
                      for i, row in enumerate(final_jsd_matrix.tolist())]

    jsd_distance_lowertriange = _DistanceMatrix(names, new_jsd_matrix)

    # construct nj phylogenetic tree
    constructor = DistanceTreeConstructor()
    nj_tree = constructor.nj(jsd_distance_lowertriange)
    Phylo.draw_ascii(nj_tree)
    # and write tree in newick format
    Phylo.write(nj_tree, tree_file, "newick")
Esempio n. 47
0
def main():
    """Command-line entry point of the automated phylogeny builder.

    Parses the command line, sketches the query assemblies with the mash
    helpers, selects (or is given) the genomes to compare against, and then
    either

    * runs the snippy -> snippy-core -> masking -> gubbins -> RAxML pipeline
      through shell commands (default), or
    * with ``--fast``, builds a neighbor-joining tree from the mash distance
      matrix and writes it to ``my_tree.tree`` in newick format.

    Input errors are reported through the file's ``error(error_dict)``
    helper, like the other argument checks in this function.

    NOTE(review): ``temp_dir`` is created with ``tempfile.mkdtemp()`` and is
    not removed anywhere in this function -- confirm whether a helper cleans
    it up.
    """
    ### Main Arg Parse ###
    parser = argparse.ArgumentParser(
        description="Automated Phylogeny Builder v1")
    parser.add_argument(
        '-d',
        '--indir',
        help="Input Directory: Directory of FASTA files to analyze")
    parser.add_argument('-o', '--out', help="Output Directory", required=True)
    parser.add_argument('-t',
                        '--threads',
                        help="Number of max threads to use (default=1)",
                        default="1")
    parser.add_argument(
        '-b',
        '--mash_db',
        help="Provide prebuilt mash DB, otherwise build from scratch")
    parser.add_argument(
        '-f',
        '--fast',
        help="Fast option for distance based neighbor joining tree",
        action="store_true")
    parser.add_argument(
        '-m',
        '--max_num',
        help="Maximum number of isolates to include (default=50)",
        default="50")
    parser.add_argument(
        '-g',
        '--genomes',
        help=
        "Provide genome directory to build tree with instead of automatically picking, requires -r flag"
    )
    parser.add_argument(
        '-r',
        '--reference',
        help=
        "Required with -g flag; provide reference to use for phylogeny when providing genome directory"
    )
    parser.add_argument('-s',
                        '--snippy',
                        help="existing snippy dir, requires -g and -r")
    parser.add_argument(
        '-p',
        '--proj_name',
        help=
        "project prefix - will be used to label all files associated with project",
        required=True)
    args = vars(parser.parse_args())

    ### Print Args ###
    print("Running with the following parameters:")
    for arg in args:
        print(arg, ":", args[arg])

    ### Set Output (Create if doesn't exist already) ###
    set_output(args["out"])

    ### Initialize variables ###
    automatic_selection = True
    threads = args["threads"]
    error_dict = {}
    temp_dir = tempfile.mkdtemp()
    project_name = args["proj_name"]
    dir_flag = False
    mash_assembly_list = []
    max_num = int(args["max_num"])
    # The full (non --fast) pipeline needs a reference genome.
    need_ref = not args["fast"]
    # Stays None unless -d, or -g together with -r, supplies a directory.
    input_dir = None
    if args["mash_db"]:
        mash_db = args["mash_db"]
    if args["indir"]:
        input_dir = args["indir"]

    ### Validate option combinations ###
    if args["snippy"]:
        if not args["genomes"]:
            error_dict[
                "snippy dir provided without genome dir, exiting"] = "Input error: "
            error(error_dict)
        if not args["reference"]:
            error_dict[
                "snippy dir provided without reference, exiting"] = "Input error: "
            error(error_dict)
        automatic_selection = False
    if args["genomes"] and args["reference"]:
        # An explicit genome directory takes precedence over -d.
        input_dir = args["genomes"]
        reference = args["reference"]
        dir_flag = True
        automatic_selection = False
    if args["genomes"] and not args["reference"]:
        error_dict[
            "Genome dir provided without reference, exiting"] = "Input error: "
        error(error_dict)
    if args["reference"] and not args["genomes"]:
        error_dict[
            "Reference provided without genome directory, exiting"] = "Input error: "
        error(error_dict)
    if input_dir is None:
        # Without this check os.listdir() below fails with a NameError.
        error_dict[
            "No input directory provided (use -d, or -g with -r), exiting"] = "Input error: "
        error(error_dict)
        # Only reached if error() returns instead of terminating.
        return

    ### Collect and sketch the query assemblies ###
    query_assemblies = [
        os.path.join(input_dir, in_file) for in_file in os.listdir(input_dir)
    ]
    in_file_counter = len(query_assemblies)
    if in_file_counter == 0:
        # An empty directory would otherwise raise ZeroDivisionError below.
        error_dict[
            "Input directory contains no files, exiting"] = "Input error: "
        error(error_dict)
        return
    # Remaining isolate budget shared evenly between the query files;
    # truncated with int() where it is passed to pick_genomes().
    max_num_per_query = (max_num - in_file_counter) / in_file_counter
    query_sketch = mash_sketch_list(threads, query_assemblies, OUTPUT_DIR,
                                    project_name, temp_dir)

    if need_ref:
        ref_path = pick_reference(query_sketch, threads)

    if not args["mash_db"]:
        # No prebuilt database: sketch every assembly returned by the API.
        bmgap_data = call_bmgap_api()
        for record in bmgap_data:
            mash_assembly_list.append(bmgap_data[record]["assembly_path"])
        mash_db = mash_sketch_list(threads, mash_assembly_list, OUTPUT_DIR,
                                   project_name, temp_dir)

    ### Choose the genomes that go into the tree ###
    if automatic_selection:
        # NOTE(review): force_max is not defined in this function; presumably
        # a module-level setting -- confirm.
        final_genomes = pick_genomes(query_sketch, mash_db, args["threads"],
                                     int(max_num_per_query), force_max)
        if need_ref:
            final_genomes["ref"] = ref_path
            print(ref_path)
    else:
        final_genomes = {
            "all_genomes": [],
            "details": {},
            "ref": args["reference"]
        }
        for infile in os.listdir(args["genomes"]):
            # Keep files whose name contains any known FASTA extension.
            if any(ext in infile for ext in fasta_extensions):
                infile_path = os.path.join(args["genomes"], infile)
                if infile_path not in final_genomes["all_genomes"]:
                    final_genomes["all_genomes"].append(infile_path)

    if not args["fast"]:
        ### SNP pipeline: snippy -> snippy-core -> mask -> gubbins -> RAxML ###
        if not args["snippy"]:
            snippy_dir = run_snippy(final_genomes, args["threads"],
                                    query_assemblies, dir_flag)
        else:
            snippy_dir = args["snippy"]
            redo_list = snippy_check(snippy_dir)
            for obj in redo_list:
                print(obj)
                for genome in os.listdir(input_dir):
                    print(genome)
                    if obj in genome:
                        print("found")
                        # The names come from input_dir, so the path has to
                        # be built from input_dir as well.
                        redo_obj = os.path.join(input_dir, genome)
                        call_snippy(reference, redo_obj)

        call([
            "snippy-core --prefix={}_core --aformat=fasta {}/*".format(
                project_name, snippy_dir)
        ],
             shell=True)
        p2 = Popen(["mv {}_core* {}".format(project_name, snippy_dir)],
                   shell=True)
        p2.wait()
        p3 = Popen([
            "python3 {} {}/{}_core.full.aln -o {}".format(
                mask_map_script, snippy_dir, project_name, OUTPUT_DIR)
        ],
                   shell=True)
        p3.wait()
        masked_aln_file = "{}/{}_core.full_masked.aln".format(
            OUTPUT_DIR, project_name)
        partition_file = "{}/{}_core.full_partition.txt".format(
            OUTPUT_DIR, project_name)
        print("gubbins")
        p4 = Popen([
            "run_gubbins.py -c {} -i 10 -u -p {}/gubbins_masked -v -t raxml {}"
            .format(args["threads"], OUTPUT_DIR, masked_aln_file)
        ],
                   shell=True)
        p4.wait()
        gubbins_phylip_file = "{}/gubbins_masked.filtered_polymorphic_sites.phylip".format(
            OUTPUT_DIR)
        p5 = Popen([
            "python3 {} {} {}".format(adjust_size_script, gubbins_phylip_file,
                                      partition_file)
        ],
                   shell=True)
        p5.wait()
        abs_output = os.path.abspath(OUTPUT_DIR)
        print("raxml")
        p6 = Popen([
            "raxmlHPC-PTHREADS -s {} -w {} -n {}_out --asc-cor=stamatakis -q {} -m GTRGAMMAX -T {} -N autoMRE -p 6420662893125220392 -f a -x 7125452922221827660"
            .format(gubbins_phylip_file, abs_output, project_name,
                    partition_file, args["threads"])
        ],
                   shell=True)
        p6.wait()
    else:
        ### Fast path: neighbor-joining tree from mash distances ###
        mash_matrix = make_mash_matrix(threads, final_genomes["all_genomes"],
                                       OUTPUT_DIR, project_name, temp_dir)
        matrix = []
        names = []
        header_seen = False
        for line in mash_matrix.split("\n"):
            if line.strip() == "":
                continue
            if not header_seen:
                # First non-blank line carries the tab-separated names.
                print(line)
                names.extend(obj for obj in line.split("\t") if len(obj) > 0)
                header_seen = True
                continue
            # Column 0 is skipped; the k-th data row contributes its first k
            # values, giving the lower-triangular layout (diagonal included)
            # handed to _DistanceMatrix.
            values = line.split("\t")
            row_len = len(matrix) + 1
            matrix.append([float(values[q]) for q in range(1, row_len + 1)])
        print("building tree")
        dm = _DistanceMatrix(names, matrix)
        constructor = DistanceTreeConstructor(method="nj")
        tree = constructor.nj(dm)
        Phylo.write([tree], "my_tree.tree", "newick")
Esempio n. 48
0
def main(argv):
	"""Plot a neighbor-joining tree with abundance bubbles from a common-kmer x vector.

	argv is the list of command-line arguments without the program name.
	Options:
		-h / --Help                    print usage and exit (status 2)
		-i / --InputCommonKmerXFile    text file with one float per line (the x vector)
		-D / --CommonKmerDataPath      directory holding Taxonomy.txt, FileNames.txt,
		                               CommonKmerMatrix-30mers.h5 and CommonKmerMatrix-50mers.h5
		-l, -n                         accepted; the values are stored but not used below
		-r                             rectangular instead of circular layout
		-t                             append the taxonomy string to leaf names
		-o / --OutFile                 image file the tree is rendered to
		-x / --OutFileXML              file the PhyloXML export is written to
		-w / --Width                   image width in mm (default 750)

	A distance matrix is derived from the two normalized common-kmer
	matrices, a neighbor-joining tree is built from it, hypothetical nodes
	are inserted along the root-to-leaf paths at fixed fractions, and circles
	sized from the x vector are drawn on leaves and hypothetical nodes.

	An unknown option prints the usage line and exits with status 2.
	"""
	input_file=''
	title='Title'
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	plot_rectangular = False
	common_kmer_data_path=''
	taxonomic_names_on_leaves = False
	usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
	try:
		# "h" takes no argument, so a bare -h reaches the help branch below.
		opts, args = getopt.getopt(argv,"hi:lnrto:w:x:D:",["Help=","InputCommonKmerXFile=","LabelLeaves=", "LabelInternalNodes=","Rectangular=", "TaxonomicNamesOnLeaves=", "OutFile=","Width=","OutFileXML=","CommonKmerDataPath="])
	except getopt.GetoptError:
		print('Unknown option, call using: ' + usage)
		sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--Help"):
			print(usage)
			sys.exit(2)
		elif opt in ("-i", "--InputCommonKmerXFile"):
			input_file = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg
		elif opt in ("-D", "--CommonKmerDataPath"):
			common_kmer_data_path = arg
		elif opt in ("-r", "--Rectangular"):
			plot_rectangular = True
		elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
			taxonomic_names_on_leaves = True

	#Read in the x vector (a real list: it is sliced and indexed below)
	with open(input_file,'r') as fid:
		x = [float(line) for line in fid]

	#Read in the taxonomy: first whitespace-separated token of each line, with
	#everything up to the first "_" (the taxID) removed
	with open(os.path.join(common_kmer_data_path,"Taxonomy.txt"),'r') as fid:
		taxonomy = ['_'.join(line.split()[0].split("_")[1:]) for line in fid]

	#Read in the basis for the ckm matrices
	with open(os.path.join(common_kmer_data_path,"FileNames.txt"),'r') as fid:
		x_file_names = [os.path.basename(line.strip()) for line in fid]

	#Read in the common kmer matrices; np.array copies the data into memory,
	#so the HDF5 files can be closed right away
	with h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-30mers.h5'),'r') as f:
		ckm30=np.array(f['common_kmers'],dtype=np.float64)
	with h5py.File(os.path.join(common_kmer_data_path,'CommonKmerMatrix-50mers.h5'),'r') as f:
		ckm50=np.array(f['common_kmers'],dtype=np.float64)
	#Scale each column by the reciprocal of its diagonal entry
	ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
	ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
	num_rows = ckm30_norm.shape[0]
	names = x_file_names
	#Lower-triangular distance matrix (diagonal included): the average over
	#the 30-mer and 50-mer matrices of 1 minus the symmetrized similarity
	matrix=list()
	for i in range(num_rows):
		matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

	#Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
	dm = _DistanceMatrix(names, matrix)
	constructor = DistanceTreeConstructor()
	tree = constructor.nj(dm)
	t=Tree(tree.format('newick'),format=1)

	#Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
	#Function to insert a node at a given distance
	def insert_node(t, name_to_insert, insert_above, dist_along):
		insert_at_node = t.search_nodes(name=insert_above)[0]
		parent = (t&insert_above).up
		orig_branch_length = t.get_distance(insert_at_node,parent)
		if orig_branch_length < dist_along:
			raise ValueError("error: dist_along larger than orig_branch_length")
		#Split the branch: parent -> new node (dist_along) -> original node (remainder)
		removed_node = insert_at_node.detach()
		removed_node.dist = orig_branch_length - dist_along
		added_node = parent.add_child(name=name_to_insert, dist=dist_along)
		added_node.add_child(removed_node)

	#Function to insert a node some % along a branch
	def insert_hyp_node(t, leaf_name, percent):
		total_dist = t.get_distance(t.name,leaf_name)
		percent_dist = percent*total_dist
		child_node = (t&leaf_name)
		ancestor_node = (t&child_node.name).up
		#Walk up from the leaf until the ancestor is no further from the root than the target distance
		while t.get_distance(t.name, ancestor_node) > percent_dist:
			child_node = ancestor_node
			ancestor_node = (t&child_node.name).up
		insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

	#Insert hypothetical nodes
	hyp_node_names = dict()
	cutoffs = [y**1.5 for y in (.9,.8,.7,.6,.5,.4,.3,.2,.1)]
	num_files = len(x_file_names)
	for i, file_name in enumerate(x_file_names):
		#Every num_files-th entry of x starting at i; entry 0 is the leaf
		#value itself, entry j+1 is paired with cutoffs[j]
		xi = x[i::num_files]
		for j, cutoff in enumerate(cutoffs):
			if xi[j+1]>0:
				insert_hyp_node(t, file_name, cutoff)
				hyp_node_names[file_name+"_"+str(cutoff)] = [file_name, cutoff, j] #in case there are "_" in the file names

	#Now put the bubbles on the nodes
	def layout(node):
		if node.is_leaf():
			if node.name in x_file_names:
				#make reconstructed bubble
				size = x[x_file_names.index(node.name)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")
				if taxonomic_names_on_leaves:
					nameFace = AttrFace("name", fsize=25, fgcolor='black',text_suffix="_"+taxonomy[x_file_names.index(node.name)])
				else:
					nameFace = AttrFace("name", fsize=25, fgcolor='black')
				faces.add_face_to_node(nameFace, node, 0, position="branch-right")
		elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
			node_base_name = hyp_node_names[node.name][0]
			if node_base_name in x_file_names:
				idx = hyp_node_names[node.name][2]
				#Same x entry that was tested when the node was inserted
				size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
				F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
				F.border.width = None
				F.opacity = 0.6
				faces.add_face_to_node(F,node, 0, position="branch-right")

	ts = TreeStyle()
	ts.layout_fn = layout
	if plot_rectangular:
		ts.mode = "r"
	else:
		ts.mode = "c"
	ts.show_leaf_name = False
	ts.min_leaf_separation = 50

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	#Close the handle explicitly so the XML is flushed to disk
	with open(out_file_xml,'w') as xml_out:
		project.export(xml_out)